Source code for chemotools.outliers._dmodx

"""
The :mod:`chemotools.outliers._dmodx` module implements the Distance to Model (DModX) outlier detection algorithm.
"""

# Authors: Pau Cabaneros
# License: MIT

from typing import Optional, Union
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils._param_validation import Interval, Real
from scipy.stats import f as f_distribution


from ._base import _ModelResidualsBase, ModelTypes
from ._utils import calculate_residual_spectrum


class DModX(_ModelResidualsBase):
    """Calculate Distance to Model (DModX) statistics.

    DModX measures the distance between an observation and the model plane
    in the X-space, and is useful for detecting outliers.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted PCA/PLS model or a Pipeline ending with such a model.

    confidence : float, default=0.95
        Confidence level for statistical calculations (between 0 and 1).

    mean_centered : bool, default=True
        Indicates if the input data was mean-centered before modeling.

    Attributes
    ----------
    estimator_ : ModelTypes
        The fitted model of type _BasePCA or _PLS.

    transformer_ : Optional[Pipeline]
        Preprocessing steps applied before the model.

    n_features_in_ : int
        Number of features in the input data.

    n_components_ : int
        Number of components in the model.

    n_samples_ : int
        Number of samples used to train the model.

    critical_value_ : float
        The calculated critical value for outlier detection.

    train_sse_ : float
        The training sum of squared errors (SSE) of the model.

    A0_ : int
        Adjustment factor for degrees of freedom based on mean centering.

    References
    ----------
    [1] Max Bylesjö, Mattias Rantalainen, Oliver Cloarec, Johan K. Nicholson,
        Elaine Holmes, Johan Trygg.
        "OPLS discriminant analysis: combining the strengths of PLS-DA and
        SIMCA classification."
        Journal of Chemometrics 20 (8-10), 341-351 (2006).

    Examples
    --------
    >>> from chemotools.datasets import load_fermentation_train
    >>> from chemotools.outliers import DModX
    >>> from sklearn.decomposition import PCA

    >>> # Load sample data
    >>> X, _ = load_fermentation_train()

    >>> # Fit the PCA model
    >>> pca = PCA(n_components=3).fit(X)

    >>> # Initialize DModX with the fitted PCA model and fit it
    >>> dmodx = DModX(model=pca, confidence=0.95, mean_centered=True)
    >>> dmodx.fit(X)
    DModX(model=PCA(n_components=3), confidence=0.95, mean_centered=True)

    >>> # Predict outliers in the dataset
    >>> outliers = dmodx.predict(X)

    >>> # Calculate DModX residuals
    >>> residuals = dmodx.predict_residuals(X)
    """

    _parameter_constraints: dict = {
        "model": [Pipeline, ModelTypes],
        "confidence": [Interval(Real, 0, 1, closed="both")],
        "mean_centered": [bool],
    }

    def __init__(
        self,
        model: Union[ModelTypes, Pipeline],
        confidence: float = 0.95,
        mean_centered: bool = True,
    ) -> None:
        super().__init__(model, confidence)
        self.mean_centered = mean_centered

    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
        """
        Fit the model and compute training residual variance.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features)
            The input data used to fit the model.

        y : None
            Ignored to align with API.

        Returns
        -------
        self : DModX
            Fitted estimator with computed training residuals and critical value.
        """
        X_validated = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Process data through transformer if available
        X_processed = (
            self.transformer_.transform(X_validated)
            if self.transformer_
            else X_validated
        )

        # Calculate residuals for the training set
        residuals = calculate_residual_spectrum(X_processed, self.estimator_)

        # Sum of squared residuals for the training set
        self.train_sse_ = np.sum(residuals**2)

        # Set degrees of freedom depending on mean centering
        self.A0_ = 1 if self.mean_centered else 0

        # Compute the critical value
        self.critical_value_ = self._calculate_critical_value()

        return self

    def predict(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
        """
        Identify outliers in the input data.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features)
            The input data to predict outliers for.

        y : None
            Ignored to align with API.

        Returns
        -------
        outliers : np.ndarray of shape (n_samples,)
            Array indicating outliers (-1) and inliers (1).
        """
        return super().predict(X, y)

    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
    ) -> np.ndarray:
        """
        Calculate normalized DModX statistics for input data.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features)
            The input data to calculate DModX statistics for.

        y : None
            Ignored.

        validate : bool, default=True
            If True, validate the input data.

        Returns
        -------
        dmodx_values : np.ndarray of shape (n_samples,)
            The normalized DModX statistics for each sample.
        """
        # Ensure the model is fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate input data if required, without resetting fitted attributes
        if validate:
            X = validate_data(
                self,
                X,
                y="no_validation",
                ensure_2d=True,
                reset=False,
                dtype=np.float64,
            )

        # Process data through transformer if available
        X_processed = self.transformer_.transform(X) if self.transformer_ else X

        # Calculate residuals for the input data
        residuals = calculate_residual_spectrum(X_processed, self.estimator_)
        sample_sse = np.sum(residuals**2, axis=1)

        # Normalize residuals per dimension
        residual_norm = np.sqrt(
            sample_sse / (self.n_features_in_ - self.n_components_)
        )

        # Scale factor based on training set residuals
        training_residual_scale = np.sqrt(
            self.train_sse_
            / (
                (self.n_samples_ - self.n_components_ - self.A0_)
                * (self.n_features_in_ - self.n_components_)
            )
        )

        return residual_norm / training_residual_scale

    def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
        """Calculate the F-distribution based critical value."""
        dof_num = self.n_features_in_ - self.n_components_
        dof_den = self.n_samples_ - self.n_components_ - self.A0_

        f_quantile = f_distribution.ppf(self.confidence, dof_num, dof_den)
        return np.sqrt(f_quantile)
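
A minimal end-to-end sketch of typical usage follows, for illustration only. The __main__ guard, the StandardScaler preprocessing step, and the printed summary are assumptions and are not part of this module; the dataset loader, the DModX parameters, and the -1/1 outlier flags follow the docstrings above.

if __name__ == "__main__":
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    from chemotools.datasets import load_fermentation_train
    from chemotools.outliers import DModX

    # Load the example fermentation spectra shipped with chemotools
    X, _ = load_fermentation_train()

    # Fit a preprocessing + PCA pipeline; DModX also accepts a fitted Pipeline
    # whose final step is a PCA/PLS model (StandardScaler is illustrative only)
    pipeline = make_pipeline(StandardScaler(), PCA(n_components=3)).fit(X)

    # Fit DModX on the training data to set train_sse_ and critical_value_
    dmodx = DModX(model=pipeline, confidence=0.95, mean_centered=True).fit(X)

    # predict marks outliers as -1 and inliers as 1;
    # predict_residuals returns the normalized DModX statistic per sample
    flags = dmodx.predict(X)
    distances = dmodx.predict_residuals(X)

    print(f"Critical value: {dmodx.critical_value_:.3f}")
    print(f"Largest DModX statistic: {distances.max():.3f}")
    print(f"Flagged outliers: {int((flags == -1).sum())} of {flags.shape[0]}")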