Source code for chemotools.outliers._hotelling_t2

"""
The :mod:`chemotools.outliers._hotelling_t2` module implements Hotelling's T-squared
outlier detection algorithm.
"""

# Authors: Pau Cabaneros
# License: MIT

from typing import Optional, Union
import numpy as np

from sklearn.cross_decomposition._pls import _PLS
from sklearn.decomposition._base import _BasePCA
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils._param_validation import Interval, Real

from scipy.stats import f as f_distribution

from ._base import _ModelResidualsBase, ModelTypes


[docs] class HotellingT2(_ModelResidualsBase): """ Calculate Hotelling's T-squared statistics for PCA or PLS like models. Parameters ---------- model : Union[ModelType, Pipeline] A fitted PCA/PLS model or Pipeline ending with such a model confidence : float, default=0.95 Confidence level for statistical calculations (between 0 and 1) Attributes ---------- estimator_ : ModelType The fitted model of type _BasePCA or _PLS transformer_ : Optional[Pipeline] Preprocessing steps before the model n_features_in_ : int Number of features in the input data n_components_ : int Number of components in the model n_samples_ : int Number of samples used to train the model critical_value_ : float The calculated critical value for outlier detection References ---------- [1] Johan A. Westerhuis, Stephen P. Gurden, Age K. Smilde Generalized contribution plots in multivariate statistical process monitoring Chemometrics and Intelligent Laboratory Systems 51 2000 95–114 (2001). Examples -------- >>> from chemotools.datasets import load_fermentation_train >>> from chemotools.outliers import HotellingT2 >>> from sklearn.decomposition import PCA >>> # Load sample data >>> X, _ = load_fermentation_train() >>> # Instantiate the PCA model >>> pca = PCA(n_components=3).fit(X) >>> # Initialize HotellingT2 with the fitted PCA model >>> hotelling_t2 = HotellingT2(model=pca, confidence=0.95) HotellingT2(model=PCA(n_components=3), confidence=0.95) >>> hotelling_t2.fit(X) >>> # Predict outliers in the dataset >>> outliers = hotelling_t2.predict(X) >>> # Calculate Hotelling's T-squared statistics >>> t2_stats = hotelling_t2.predict_residuals(X) """ _parameter_constraints = { "model": [Pipeline, ModelTypes], "confidence": [Interval(Real, 0, 1, closed="both")], } def __init__( self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95 ) -> None: self.model, self.confidence = model, confidence super().__init__(model, confidence)
[docs] def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "HotellingT2": """ Fit the model to the input data. This step calculates the critical value for the outlier detection. Parameters ---------- X : array-like of shape (n_samples, n_features) Input data y : Ignored Not used, present for API consistency by convention. Returns ------- self : HotellingT2 Fitted estimator with the critical threshold computed """ X = validate_data( self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64 ) self.critical_value_ = self._calculate_critical_value() return self
[docs] def predict(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray: """Identify outliers in the input data. Parameters ---------- X : array-like of shape (n_samples, n_features) Input data y : None Ignored to align with API. Returns ------- ndarray of shape (n_samples,) Boolean array indicating outliers (-1) and inliers (1) """ return super().predict(X, y)
[docs] def predict_residuals( self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True ) -> np.ndarray: """Calculate Hotelling's T-squared statistics for input data. Parameters ---------- X : array-like of shape (n_samples, n_features) Input data y : None Ignored. Returns ------- ndarray of shape (n_samples,) Hotelling's T-squared statistics for each sample """ # Check the estimator has been fitted check_is_fitted(self, ["critical_value_"]) # Validate the input data if validate: X = validate_data( self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64 ) # Apply preprocessing steps if self.transformer_: X = self.transformer_.transform(X) # Calculate the Hotelling's T-squared statistics if isinstance(self.estimator_, _BasePCA): # For PCA-like models variances = self.estimator_.explained_variance_ if isinstance(self.estimator_, _PLS): # For PLS-like models variances = np.var(self.estimator_.x_scores_, axis=0) # Equivalent to X @ model.components_.T for _BasePCA and X @ model.x_rotations_ for _PLS X_transformed = self.estimator_.transform(X) return np.sum((X_transformed**2) / variances, axis=1)
def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float: """ Calculate the critical value for the Hotelling's T-squared statistics. Parameters ---------- X : Ignored Not used, present for API consistency by convention. Returns ------- float The critical value for the Hotelling's T-squared statistics """ critical_value = f_distribution.ppf( self.confidence, self.n_components_, self.n_samples_ - self.n_components_ ) return ( critical_value * self.n_components_ * (self.n_samples_ - 1) / (self.n_samples_ - self.n_components_) )