Source code for mlcolvar.explain.lasso

import numpy as np
import torch

import matplotlib 
import matplotlib.pyplot as plt

try:
    import sklearn
except ImportError:
    print('The lasso module requires scikit-learn as additional dependency.')

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV, LassoCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import balanced_accuracy_score

__all__ = [ "lasso_classification", "plot_lasso_classification", "lasso_regression", "plot_lasso_regression" ]

class SparsityScoring:
    """Scorer function used as metric in lasso_classification. 
    The score balances the accuracy of the predictions and the number of features n:
    .. math:: \text{score} = - (1-\text{accuracy})*100 - \left|n-\text{min\_features}\right| 

    this implies that a feature is retained only if increases the accuracy by 1%.
    """

    def __init__(self, min_features=0 ):
        """Initialize scorer with (optional) number of features

        Parameters
        ----------
        min_features : int, optional
            minimum number of features, by default 0
        """
        self.min_features = min_features

    def __call__(self, estimator, X, y, **kwargs):
        y_pred = estimator.predict(X)
        # Accuracy
        #acc = accuracy_score(y, y_pred)
        acc = balanced_accuracy_score(y, y_pred)
        # Number of features
        coefficents = estimator.coef_
        num = np.count_nonzero(coefficents)

        # criterion
        loss = (1-acc)*100 + np.abs(num-self.min_features)
        return -1*loss
    
    def accuracy_from_score(self, score, num_features):
        loss = -1*score
        err = loss - np.abs(num_features - self.min_features)
        acc = 1 - err/100
        return acc

[docs] def lasso_classification(dataset, min_features = 0, Cs = 40, scale_inputs = True, feature_names = None, print_info = True, plot = True ): """Perform sparse classification via LASSO on a given DictDataset (requires keys: "data" and "labels"). The (inverse) regularization strength C is automatically chosen based on cross-validation on a set of values (Cs), see sklearn.linear_model.LogisticRegressionCV. The scoring function used is `SparsityScoring`, balancing the accuracy and the number of features. In the two-classes case a single classifier is built, otherwise a one-vs-rest classifier is constructed, composed by N different estimators are trained to classify each state from the others. Parameters ---------- dataset : DictDataset dataset with 'data' and 'labels' min_features : int, optional minimum number of features, by default 0 Cs : int or array-like, optional Each of the values in Cs describes the inverse of regularization strength. If Cs is as an int, then a grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4. Like in support vector machines, smaller values specify stronger regularization., by default 40 scale_inputs : bool, optional whether to standardize inputs based on mean and std.dev., by default True feature_names : list, optional names of the input features, if not given they are taken from `dataset.feature_names`, by default None print_info : bool, optional whether to print results, by default True plot : bool, optional whether to plot results, by default True See also -------- mlcolvar.explain.lasso.SparsityScoring Scoring function used in LASSO classification Returns ------- classifier: optimized estimator feats: dictionary with the names of the non-zero features, per label coeffs: dictionary with the coefficients of the non-zero features, per label """ # Convert dataset to numpy with torch.no_grad(): raw_descriptors = dataset['data'].numpy() labels = dataset['labels'].numpy().astype(int) if feature_names is None: if dataset.feature_names is None: raise ValueError('Feature names not found (either in the dataset or as argument to the function).') feature_names = dataset.feature_names # Scaling inputs if scale_inputs: scaler = StandardScaler(with_mean=True, with_std=True) descriptors = scaler.fit_transform(raw_descriptors) else: descriptors = raw_descriptors # Define cross-validation for LASSO, using # a custom scoring function based on accuracy and number of features scorer = SparsityScoring(min_features=min_features) _classifier = LogisticRegressionCV(Cs=Cs, solver='liblinear', multi_class='ovr', fit_intercept=False, penalty='l1', max_iter = 500, scoring=scorer) # Fit classifier feature_selector = SelectFromModel(_classifier) feature_selector.fit(descriptors, labels) classifier = feature_selector.estimator_ # Get selected features and coefficients feats = {} coeffs = {} for i,key in enumerate(classifier.coefs_paths_.keys()): index = np.abs(classifier.coef_).argsort()[i][::-1] sorted_feature_names = feature_names[index] sorted_coeffs = classifier.coef_[i,index] idx = np.argwhere ( np.abs(sorted_coeffs)>1e-5 )[:,0] selected_feature_names = sorted_feature_names[idx] selected_coeffs = sorted_coeffs[idx] feats[key] = selected_feature_names coeffs[key] = selected_coeffs # display summary if print_info: #score = classifier.score(descriptors,labels) C_idx = np.argwhere(np.abs(classifier.Cs_ - classifier.C_[i]) < 1e-8)[0,0] score = classifier.scores_[key].mean(axis=0)[C_idx] accuracy = classifier.scoring.accuracy_from_score(score, len(selected_coeffs)) print(f'======= LASSO results ({key}) ========') print(f'- Regularization : {classifier.C_[i]:.8f}') print(f'- Score : {score:.2f}') print(f'- Accuracy : {accuracy*100:.2f}%') print(f'- # features : {len(selected_coeffs)}\n') print(f'Features: ') for j,(f,c) in enumerate(zip(selected_feature_names, selected_coeffs)): print(f'({j+1}) {f:13s}: {c:.6f}') print('==================================\n') # plot results if plot: plot_lasso_classification(classifier, feats, coeffs) return classifier, feats, coeffs
[docs] def plot_lasso_classification(classifier, feats = None, coeffs = None, draw_labels='auto', axs = None): """Plot results of the LASSO classification.""" # check that the tested regularization values are more than 1 if len(classifier.Cs_) == 1: print('Plotting is not available, as the regressor has been optimized with a single regularization value.') return # get number of classifiers (1 if n_states = 2, otherwise equal to n_states) n_models = len(classifier.C_) # define figure if axs is None: init_axs = True _, axs = plt.subplots(3, n_models, figsize=(6*n_models, 9), sharex=True) plt.suptitle('LASSO CLASSIFICATION') else: init_axs = False for i,key in enumerate(classifier.scores_.keys()): # (1) COEFFICIENTS PATH ax = axs[0] if n_models == 1 else axs[0][i] c = 'black' ax.plot(classifier.Cs_, np.mean(classifier.coefs_paths_[key], axis=0),color=c,alpha=0.6) ax.set_xscale('log') ax.set_title(f'Coefficients path ({key})') ax.set_xmargin(0) ax.set_ylabel('Coefficients',color=c) values = np.asarray( [np.mean(c, axis=0) for c in classifier.coefs_paths_.values() ]) ax.set_ylim(values.min(),values.max()) if draw_labels == 'auto': draw_labels = False if feats is not None and coeffs is not None: draw_labels = True if len(feats[key])<= 3 else False if draw_labels: if feats is None or coeffs is None: raise ValueError('To draw the names of the features one need to pass both the selected features (`feats`) an coefficients (`coeffs`).') for name, coef in zip(feats[key],coeffs[key]): ax.text(classifier.C_[i], coef, name, fontsize='small', horizontalalignment='center', bbox=dict(facecolor='white', alpha=0.6)) # (2) ACCURACY AND SCORE ax = axs[1] if n_models == 1 else axs[1][i] c = 'fessa6' score_path = classifier.scores_[key].mean(axis=0) ax.plot(classifier.Cs_, score_path, color=c,alpha=0.9) ax.set_title('Score') if i == 0: ax.set_ylabel('Score',color=c) ax.set_ylim(-50,5) ax2 = ax.twinx() c = 'fessa5' num_features_path = np.mean(np.count_nonzero(classifier.coefs_paths_[key], axis = -1), axis= 0) ax2.plot(classifier.Cs_, classifier.scoring.accuracy_from_score(score_path, num_features_path), color=c, linestyle='-.',alpha=0.9) if i == n_models-1: ax2.set_ylabel('Accuracy',color=c) ax2.set_ylim(0.5,1.05) # (3) NUMBER OF FEATURES ax = axs[2] if n_models == 1 else axs[2][i] c = 'fessa0' ax.plot(classifier.Cs_, num_features_path, color=c) ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True)) ax.set_title('Number of features') ax.set_xlabel('Regularization strength'+r'$^{-1}$') ax.set_ylabel('# features',color=c) ax.set_ylim(0,None) # selected regularization value axs_i = axs if n_models == 1 else axs[:,i] for ax in axs_i: ax.axvline(classifier.C_[i],color='gray',linestyle='dotted') ax.set_xmargin(0) if init_axs: matplotlib.pyplot.tight_layout()
[docs] def lasso_regression(dataset, alphas = None, scale_inputs = True, print_info = True, plot = True, ): """Perform sparse regression via LASSO on a given DictDataset (requires keys: "data" and "target"). The regularization strength alpha is automatically chosen based on cross-validation on a set of values (alphas), see sklearn.linear_model.LassoCV. The scoring function used is the MSE loss + alpha * L1 regularization. Parameters ---------- dataset : DictDataset dataset with 'data' and 'target' alphas : int or array-like, optional List of alphas where to compute the models. If None alphas are set automatically. scale_inputs : bool, optional whether to standardize inputs based on mean and std.dev., by default True feature_names : list, optional names of the input features, if not given they are taken from `dataset.feature_names`, by default None print_info : bool, optional whether to print results, by default True plot : bool, optional whether to plot results, by default True Returns ------- regressor: optimized estimator feats: names of the non-zero features coeffs: coefficients of the non-zero features """ # Convert dataset to numpy with torch.no_grad(): raw_descriptors = dataset['data'].numpy() target = dataset['target'].numpy() feature_names = dataset.feature_names # Check target shape if (target.ndim > 1) and (target.shape[1] != 1): raise ValueError(f'Lasso Regressor only accepts scalar targets, while an array of shape {target.shape} is given.') # Scaling inputs if scale_inputs: scaler = StandardScaler(with_mean=True, with_std=True) descriptors = scaler.fit_transform(raw_descriptors) else: descriptors = raw_descriptors # Define Cross-validation & fit _regressor = LassoCV(alphas=alphas) feature_selector = SelectFromModel(_regressor) feature_selector.fit(descriptors, np.squeeze(target)) regressor = feature_selector.estimator_ # Save coefficients path _, coefs_paths, dual_gaps = regressor.path(descriptors, target, alphas=regressor.alphas_) regressor.coefs_paths_ = coefs_paths # Get selected features and coefficients index = np.abs(regressor.coef_).argsort()[::-1] sorted_feature_names = feature_names[index] sorted_coeffs = regressor.coef_[index] idx = np.argwhere ( np.abs(sorted_coeffs)>1e-5 )[:,0] selected_feature_names = sorted_feature_names[idx] selected_coeffs = sorted_coeffs[idx] # display summary if print_info: score = regressor.score(descriptors,target) print(f'========= LASSO results ==========') print(f'- Regularization : {regressor.alpha_:.8f}') print(f'- Score : {score:.8f}') print(f'- # features : {len(selected_coeffs)}') print('\n======= Relevant features =======') for i,(f,c) in enumerate(zip(selected_feature_names, selected_coeffs)): print(f'({i+1}) {f:13s}: {c:.6f}') print('=================================') # plot results if plot: plot_lasso_regression(regressor, selected_feature_names, selected_coeffs) return regressor, selected_feature_names, selected_coeffs
[docs] def plot_lasso_regression(regressor, feats = None, coeffs = None, draw_labels='auto', axs = None): """Plot the results of the LASSO regression.""" # check that the tested regularization values are more than 1 if len(regressor.alphas_) == 1: print('Plotting is not available, as the regressor has been optimized with a single regularization value.') return # define figure if axs is None: fig, axs = plt.subplots(3, 1, figsize=(6, 9), sharex=True) plt.suptitle('LASSO REGRESSION') init_axs = True else: init_axs = False # (1) COEFFICIENTS PATH ax = axs[0] c = 'black' alphas = regressor.alphas_ coefs_paths = regressor.coefs_paths_.T axs[0].plot(alphas, coefs_paths,color=c,alpha=0.6) ax.set_xscale('log') ax.set_title('Coefficients path') ax.set_xmargin(0) ax.set_ylabel('Coefficients',color=c) if draw_labels == 'auto': draw_labels = False if feats is not None and coeffs is not None: draw_labels = True if len(feats)<= 3 else False if draw_labels: if feats is None or coeffs is None: raise ValueError('To draw the labels one need to pass both the selected features (`feats`) an coefficients (`coeffs`).') for name, coef in zip(feats,coeffs): ax.text(regressor.alpha_, coef, name, fontsize='small', horizontalalignment='center', bbox=dict(facecolor='white', alpha=0.6)) # (2) ACCURACY AND SCORE ax = axs[1] c = 'fessa6' mse_path = regressor.mse_path_.mean(axis=1) ax.plot(alphas, mse_path, color=c,alpha=0.9) ax.set_title('Score') ax.set_ylabel('MSE',color=c) # (3) NUMBER OF FEATURES ax = axs[2] c = 'fessa0' num_features_path = np.count_nonzero(coefs_paths, axis = 1) ax.plot(alphas, num_features_path, color=c) ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True)) ax.set_title('Number of features') ax.set_xlabel('Regularization strength') ax.set_ylabel('# features',color=c) # selected regularization value for ax in axs: ax.axvline(regressor.alpha_,color='gray',linestyle='dotted') if init_axs: matplotlib.pyplot.tight_layout()
def test_lasso_classification(): from mlcolvar.data import DictDataset for n_states in [2,3]: # create dataset samples = 50 X = torch.randn((samples*2, 2)) # create labels y = torch.zeros(samples*2) for i in range(1, n_states): y[samples * i :] += 1 dataset = DictDataset({"data": X, "labels": y}) dataset.feature_names = ['x1','x2'] for Cs in [ 40, [0.1], np.logspace(-4,0,10) ]: for min_features in [0,1]: _ = lasso_classification(dataset, min_features = min_features, Cs = Cs, scale_inputs = True, feature_names = None, print_info = False, plot = False ) def test_lasso_regression(): from mlcolvar.data import DictDataset # create dataset samples = 50 X = torch.randn((samples, 2)) y = torch.randn(samples) dataset = DictDataset({"data": X, "target": y}) dataset.feature_names = ['x1','x2'] for alphas in [ None, [0.1], np.logspace(-4,0,10) ]: _ = lasso_regression(dataset, alphas = alphas, scale_inputs = True, print_info = False, plot = False)