Source code for chemotools.feature_selection._range_cut
"""
The :mod:`chemotools.feature_selection._range_cut` module implements the RangeCut
to select specific features from spectral data based on start and end indices or
wavenumbers.
"""
from typing import Optional
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.feature_selection._base import SelectorMixin
from sklearn.utils.validation import check_is_fitted, validate_data
[docs]
class RangeCut(SelectorMixin, BaseEstimator):
"""Select a contiguous spectral region by index or by wavenumber.
The range can be specified in two ways:
* By integer indices (``start`` and ``end``)
* By wavenumber values (``start`` and ``end`` interpreted against the
provided ``wavenumbers`` array)
If ``wavenumbers`` is supplied, the closest indices to the given start / end
wavenumber values are located. Otherwise numeric ``start`` / ``end`` are
treated directly as indices. Wavenumbers must be in ascending order.
Parameters
----------
start : int, default=0
Index or wavenumber of the start of the range.
end : int, default=-1
Index or wavenumber of the end of the range.
wavenumbers : array-like, optional
Wavenumbers corresponding to columns. Must be ascending if provided.
Attributes
----------
start_index_ : int
Resolved start index.
end_index_ : int
Resolved end index.
wavenumbers_ : array-like or None
Selected wavenumbers (if provided), else ``None``.
Examples
--------
>>> from chemotools.feature_selection import RangeCut
>>> from chemotools.datasets import load_fermentation_train
>>> X, _ = load_fermentation_train()
>>> wavenumbers = X.columns.values
>>> rc = RangeCut(start=1000, end=2000, wavenumbers=wavenumbers)
>>> rc.fit(X)
RangeCut(start=1000, end=2000, wavenumbers=wavenumbers)
>>> X_cut = rc.transform(X)
>>> X_cut.shape
(21, 616)
"""
def __init__(
self,
start: int = 0,
end: int = -1,
wavenumbers: Optional[np.ndarray] = None,
):
self.start = start
self.end = end
self.wavenumbers = wavenumbers
[docs]
def fit(self, X: np.ndarray, y=None) -> "RangeCut":
"""
Fit the transformer to the input data.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input data to fit the transformer to.
y : None
Ignored to align with API.
Returns
-------
self : RangeCut
The fitted transformer.
"""
# Check that X is a 2D array and has only finite values
X = validate_data(
self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
)
# Set the start and end indices
if self.wavenumbers is None:
self.start_index_ = self.start
self.end_index_ = self.end
self.wavenumbers_ = None
else:
self.start_index_ = self._find_index(self.start)
self.end_index_ = self._find_index(self.end)
self.wavenumbers_ = self.wavenumbers[self.start_index_ : self.end_index_]
return self
def _get_support_mask(self):
"""
Get the boolean mask indicating which features are selected.
Returns
-------
mask : np.ndarray of shape (n_features,)
The boolean mask indicating which features are selected.
"""
# Check that the estimator is fitted
check_is_fitted(self, ["start_index_", "end_index_"])
# Create the mask
mask = np.zeros(self.n_features_in_, dtype=bool)
mask[self.start_index_ : self.end_index_] = True
return mask
def _find_index(self, target: float) -> int:
wavenumbers = np.array(self.wavenumbers)
return int(np.argmin(np.abs(wavenumbers - target)))