Source code for chemotools.outliers._q_residuals

"""
The :mod:`chemotools.outliers._q_residuals` module implements the Q Residuals
(Squared Prediction Error - SPE) outlier detection algorithm.
"""

# Authors: Pau Cabaneros
# License: MIT

from typing import Optional, Literal, Union

import numpy as np

from scipy.stats import norm, chi2
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils._param_validation import Interval, Real, StrOptions

from ._base import _ModelResidualsBase, ModelTypes
from ._utils import calculate_residual_spectrum


[docs] class QResiduals(_ModelResidualsBase): """ Calculate Q residuals (Squared Prediction Error - SPE) for PCA or PLS models. Parameters ---------- model : Union[ModelType, Pipeline] A fitted PCA/PLS model or Pipeline ending with such a model. confidence : float, default=0.95 Confidence level for statistical calculations (between 0 and 1). method : str, default="jackson-mudholkar" The method used to compute the confidence threshold for Q residuals. Options: - "chi-square" : Uses mean and standard deviation to approximate Q residuals threshold. - "jackson-mudholkar" : Uses eigenvalue-based analytical approximation. - "percentile" : Uses empirical percentile threshold. Attributes ---------- estimator_ : ModelType The fitted model of type _BasePCA or _PLS. transformer_ : Optional[Pipeline] Preprocessing steps before the model. n_features_in_ : int Number of features in the input data. n_components_ : int Number of components in the model. n_samples_ : int Number of samples used to train the model. critical_value_ : float The calculated critical value for outlier detection. Methods ------- fit(X, y=None) Fit the Q Residuals model by computing residuals from the training set. Calculates the critical threshold based on the chosen method. predict(X) Identify outliers in the input data based on Q residuals threshold. predict_residuals(X, y=None, validate=True) Calculate Q residuals (Squared Prediction Error - SPE) for input data. _calculate_critical_value(X) Calculate the critical value for outlier detection using the specified method. References ---------- [1] Johan A. Westerhuis, Stephen P. Gurden, Age K. Smilde (2001) Generalized contribution plots in multivariate statistical process monitoring Chemometrics and Intelligent Laboratory Systems 51 95–114 (2000) Examples -------- >>> from chemotools.datasets import load_fermentation_train >>> from chemotools.outliers import QResiduals >>> from sklearn.decomposition import PCA >>> X, _ = load_fermentation_train() >>> pca = PCA(n_components=3).fit(X) >>> # Initialize QResiduals with the fitted PCA model >>> q_residuals = QResiduals(model=pca, confidence=0.95) >>> q_residuals.fit(X) >>> # Predict outliers in the dataset >>> outliers = q_residuals.predict(X) >>> # Calculate Q-residuals >>> q_residuals_stats = q_residuals.predict_residuals(X) """ _parameter_constraints: dict = { "model": [Pipeline, ModelTypes], "confidence": [Interval(Real, 0, 1, closed="both")], "method": [StrOptions({"chi-square", "jackson-mudholkar", "percentile"})], } def __init__( self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95, method: Literal[ "chi-square", "jackson-mudholkar", "percentile" ] = "jackson-mudholkar", ) -> None: self.model, self.confidence, self.method = model, confidence, method super().__init__(model, confidence)
[docs] def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "QResiduals": """ Fit the Q Residuals model by computing residuals from the training set. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data. Returns ------- self : object Fitted instance of QResiduals. """ X = validate_data(self, X, ensure_2d=True, dtype=np.float64) if self.transformer_: X = self.transformer_.fit_transform(X) # Compute the critical threshold using the chosen method self.critical_value_ = self._calculate_critical_value(X) return self
[docs] def predict(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray: """Identify outliers in the input data based on Q residuals threshold. Parameters ---------- X : array-like of shape (n_samples, n_features) Input data. Returns ------- ndarray of shape (n_samples,) Boolean array indicating outliers (-1 for outliers, 1 for normal data). """ # Check the estimator has been fitted return super().predict(X, y)
[docs] def predict_residuals( self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True ) -> np.ndarray: """Calculate Q residuals (Squared Prediction Error - SPE) for input data. Parameters ---------- X : array-like of shape (n_samples, n_features) Input data. validate : bool, default=True Whether to validate the input data. Returns ------- ndarray of shape (n_samples,) Q residuals for each sample. """ # Check the estimator has been fitted check_is_fitted(self, ["critical_value_"]) # Validate the input data if validate: X = validate_data(self, X, ensure_2d=True, dtype=np.float64) # Apply preprocessing if available if self.transformer_: X = self.transformer_.transform(X) # Compute reconstruction error (Q residuals) residual = calculate_residual_spectrum(X, self.estimator_) Q_residuals = np.sum(residual**2, axis=1) return Q_residuals
[docs] def _calculate_critical_value( self, X: np.ndarray, ) -> float: """Calculate the critical value for outlier detection. Parameters ---------- X : array-like of shape (n_samples, n_features) Input data. X_reconstructed : array-like of shape (n_samples, n_features) Reconstructed input data. method : str Literal["chi-square", "jackson-mudholkar", "percentile"] The method used to compute the confidence threshold for Q residuals. Returns ------- float The calculated critical value for outlier detection. """ # Compute Q residuals for training data residuals = calculate_residual_spectrum(X, self.estimator_) if self.method == "chi-square": return self._chi_square_threshold(residuals) elif self.method == "jackson-mudholkar": return self._jackson_mudholkar_threshold(residuals) elif self.method == "percentile": Q_residuals = np.sum((residuals) ** 2, axis=1) return self._percentile_threshold(Q_residuals) else: raise ValueError( "Invalid method. Choose from 'chi-square', 'jackson-mudholkar', or 'percentile'." )
def _chi_square_threshold(self, residuals: np.ndarray) -> float: """Compute Q residual threshold using Chi-Square Approximation.""" eigenvalues = np.linalg.trace(np.cov(residuals.T)) theta_1 = np.sum(eigenvalues) theta_2 = np.sum(eigenvalues**2) # Degrees of freedom approximation g = theta_2 / theta_1 h = (2 * theta_1**2) / theta_2 # Compute chi-square critical value at given confidence level chi_critical = chi2.ppf(self.confidence, df=h) # Compute final Q residual threshold return g * chi_critical def _jackson_mudholkar_threshold(self, residuals: np.ndarray) -> float: """Compute Q residual threshold using Jackson & Mudholkar’s analytical method.""" eigenvalues = np.linalg.trace(np.cov(residuals.T)) theta_1 = np.sum(eigenvalues) theta_2 = np.sum(eigenvalues**2) theta_3 = np.sum(eigenvalues**3) z_alpha = norm.ppf(self.confidence) h0 = 1 - (2 * theta_1 * theta_3) / (3 * theta_2**2) term1 = theta_2 * h0 * (1 - h0) / theta_1**2 term2 = np.sqrt(z_alpha * 2 * theta_2 * h0**2) / theta_1 return theta_1 * (1 - term1 + term2) ** (1 / h0) def _percentile_threshold(self, Q_residuals: np.ndarray) -> float: """Compute Q residual threshold using the empirical percentile method.""" return np.percentile(Q_residuals, self.confidence * 100)