Source code for chemotools.feature_selection._index_selector

"""
The :mod:`chemotools.feature_selection._index_selector` module implements the IndexSelector
to select specific features from spectral data based on indices or wavenumbers.
"""

# Author: Pau Cabaneros
# License: MIT

from typing import Optional, Union

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.feature_selection._base import SelectorMixin

from sklearn.utils.validation import check_is_fitted, validate_data


[docs] class IndexSelector(SelectorMixin, BaseEstimator): """ A transformer that Selects the spectral data to a specified array of features. This array can be continuous or discontinuous. The array of features is specified by: - by the indices of the wavenumbers to select, - by the wavenumbers to select, the wavenumbers must be provided to the transformer when it is initialised. If the wavenumbers are not provided, the indices will be used instead. The wavenumbers must be provided in ascending order. Parameters ---------- features : narray-like, optional, default=None The index of the features to select. Default is None. wavenumbers : array-like, optional, default=None The wavenumbers of the input data. If not provided, the indices will be used instead. Default is None. If provided, the wavenumbers must be provided in ascending order. Attributes ---------- features_index_ : int The index of the features to select. Examples -------- >>> import numpy as np >>> from chemotools.feature_selection import IndexSelector >>> from chemotools.datasets import load_fermentation_train >>> # Load sample data >>> X, _ = load_fermentation_train() >>> # Get wavenumbers as numpy array >>> wavenumbers = X.columns.values array([ 428., 429., 431., ..., 1830., 1831., 1833.], shape=(1047,)) >>> # Define features to select >>> range_1 = np.arange(428, 551, 1) >>> range_2 = np.arange(875, 1001, 1) >>> features = np.concatenate((range_1, range_2)) >>> # Instantiate the transformer >>> selector = IndexSelector(features=features, wavenumbers=wavenumbers) IndexSelector() >>> selector.fit(X) >>> # Transform the data >>> X_selected = selector.transform(X) >>> X_selected.shape (21, 183) """ def __init__( self, features: Optional[np.ndarray] = None, wavenumbers: Optional[np.ndarray] = None, ): self.features = features self.wavenumbers = wavenumbers
[docs] def fit(self, X: np.ndarray, y=None) -> "IndexSelector": """ Fit the transformer to the input data. Parameters ---------- X : array-like of shape (n_samples, n_features) The input data to fit the transformer to. y : None Ignored to align with API. Returns ------- self : IndexSelector The fitted transformer. """ # validate that X is a 2D array and has only finite values X = validate_data( self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64 ) # Set the fitted attribute to True self._is_fitted = True # Set the start and end indices if self.features is None: self.features_index_ = self.features return self elif self.wavenumbers is None: self.features_index_ = self.features return self else: self.features_index_ = self._find_indices(self.features) return self
def _get_support_mask(self): """ Get the boolean mask indicating which features are selected. Returns ------- mask : ndarray of shape (n_features_in_,) The mask indicating the selected features. """ # Check that the estimator is fitted check_is_fitted(self) # Create the mask mask = np.zeros(self.n_features_in_, dtype=bool) mask[self.features_index_] = True return mask def _find_index(self, target: Union[float, int]) -> int: if self.wavenumbers is None: return int(target) wavenumbers = np.array(self.wavenumbers) return int(np.argmin(np.abs(wavenumbers - target))) def _find_indices(self, features: np.ndarray) -> np.ndarray: return np.array([self._find_index(feature) for feature in features])