# Source code for chemotools.feature_selection._sr_selector

"""
The :mod:`chemotools.feature_selection._sr_selector` module implements the Selectivity Ratio (SR)
feature selector for PLS regression models.
"""

# Author: Pau Cabaneros
# License: MIT

import numpy as np
from sklearn.utils.validation import validate_data

from ._base import _PLSFeatureSelectorBase


class SRSelector(_PLSFeatureSelectorBase):
    """
    Select features with the Selectivity Ratio (SR) of a PLS regression model.

    This selector is used to select features that contribute significantly
    to the latent variables in a PLS regression model using the Selectivity
    Ratio (SR) method.

    Parameters
    ----------
    model : Union[_PLS, Pipeline]
        The PLS regression model or a pipeline with a PLS regression model as
        last step.

    threshold : float, default=1.0
        The threshold for feature selection. Features with importance above
        this threshold will be selected.

    Attributes
    ----------
    estimator_ : ModelTypes
        The fitted model of type _BasePCA or _PLS

    feature_scores_ : np.ndarray
        The calculated feature scores based on the selected method.

    support_mask_ : np.ndarray
        The boolean mask indicating which features are selected.

    References
    ----------
    [1] Kim H. Esbensen,
        "Multivariate Data Analysis - In Practice", 5th Edition, 2002.

    Examples
    --------
    >>> from chemotools.datasets import load_fermentation_train
    >>> from chemotools.feature_selection import SRSelector
    >>> from sklearn.cross_decomposition import PLSRegression
    >>> # Load sample data
    >>> X, y = load_fermentation_train()
    >>> # Instantiate the PLS regression model
    >>> pls_model = PLSRegression(n_components=2).fit(X, y)
    >>> # Instantiate the SR selector with the PLS model
    >>> selector = SRSelector(model=pls_model, threshold=0.9)
    >>> selector.fit(X)
    SRSelector(model=PLSRegression(n_components=2), threshold=0.9)
    >>> # Get the selected features
    >>> X_selected = selector.transform(X)
    >>> X_selected.shape
    (21, 978)
    """

    def __init__(
        self,
        model,
        threshold: float = 1.0,
    ):
        self.model = model
        self.threshold = threshold
        # The base class receives the model; what it does with it is not
        # visible in this module.
        super().__init__(self.model)

    def fit(self, X: np.ndarray, y=None) -> "SRSelector":
        """
        Fit the transformer to calculate the feature scores and the support mask.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to fit the transformer to.

        y : None
            Ignored to align with API.

        Returns
        -------
        self : SRSelector
            The fitted transformer.
        """
        # Validate X as a 2D float64 array; y is explicitly not validated.
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Calculate the SR scores
        self.feature_scores_ = self._calculate_features(X)

        # Calculate the support mask
        self.support_mask_ = self._get_support_mask()

        return self

    def _get_support_mask(self) -> np.ndarray:
        """
        Get the support mask based on the feature scores and threshold.

        Features with scores strictly above the threshold are selected.
        Reads ``feature_scores_`` and ``threshold`` from the instance.

        Returns
        -------
        support_mask_ : np.ndarray
            The boolean mask indicating which features are selected.
        """
        return self.feature_scores_ > self.threshold

    def _calculate_features(self, X: np.ndarray) -> np.ndarray:
        """
        Vectorized Selectivity Ratio calculation from a fitted _PLS like model.

        ``X`` is projected onto the normalized regression coefficients of
        ``self.estimator_``; the part of ``X`` reconstructed from that
        projection is the "explained" part, and the score of each feature is
        the ratio of its explained to its residual sum of squares.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input training data to calculate the feature scores from.

        Returns
        -------
        feature_scores_ : np.ndarray
            The calculated feature scores based on the selected method.
        """
        # NOTE(review): X is used exactly as validated in ``fit``; no centering
        # or scaling is applied in this method - confirm this matches the
        # preprocessing the wrapped PLS model expects.
        bpls = self.estimator_.coef_
        bpls_norm = bpls.T / np.linalg.norm(bpls)

        # Handle 1D case correctly: make the direction a column vector so the
        # matrix products below stay 2D.
        if bpls.ndim == 1:
            bpls_norm = bpls_norm.reshape(-1, 1)

        # Project X onto the regression vector
        ttp = X @ bpls_norm
        ptp = X.T @ np.linalg.pinv(ttp).T

        # Predicted part of X. Since ptp.T == pinv(ttp) @ X, this is the
        # orthogonal projection of X onto the column space of ttp.
        X_hat = ttp @ ptp.T

        # Compute squared norms directly (per feature / column)
        total_ss = np.linalg.norm(X, axis=0) ** 2
        explained_ss = np.linalg.norm(X_hat, axis=0) ** 2

        # Residual sum of squares. Mathematically this is non-negative because
        # X_hat is an orthogonal projection of X, but the subtraction can go
        # slightly below zero through floating-point cancellation when a
        # feature is (almost) fully explained. Clip at zero so the denominator
        # below can never be zero or negative.
        residual_ss = np.maximum(total_ss - explained_ss, 0.0)

        # Stability: avoid division by zero
        epsilon = 1e-12

        # Calculate Selectivity Ratio
        return explained_ss / (residual_ss + epsilon)