Source code for chemotools.outliers._dmodx

"""
The :mod:`chemotools.outliers._dmodx` module implements
the Distance to Model (DModX) outlier detection algorithm.
"""

# Authors: Pau Cabaneros
# License: MIT

from typing import Optional, Union

import numpy as np
from scipy.stats import f as f_distribution
from sklearn.pipeline import Pipeline

from ._base import ModelTypes, _ModelResidualsBase
from ._utils import calculate_residual_spectrum


[docs] class DModX(_ModelResidualsBase): """Calculate Distance to Model (DModX) statistics. DModX measures the distance between an observation and the model plane in the X-space, useful for detecting outliers. Parameters ---------- model : Union[ModelType, Pipeline] A fitted PCA/PLS model or Pipeline ending with such a model confidence : float, default=0.95 Confidence level for statistical calculations (between 0 and 1) mean_centered : bool, default=True Indicates if the input data was mean-centered before modeling Attributes ---------- estimator_ : ModelType The fitted model of type _BasePCA or _PLS transformer_ : Optional[Pipeline] Preprocessing steps before the model n_features_in_ : int Number of features in the input data n_components_ : int Number of components in the model n_samples_ : int Number of samples used to train the model critical_value_ : float The calculated critical value for outlier detection train_sse_: float The training sum of squared errors (SSE) for the model normalized by degrees of freedom A0_ : int Adjustment factor for degrees of freedom based on mean centering References ---------- [1] Max Bylesjö, Mattias Rantalainen, Oliver Cloarec, Johan K. Nicholson, Elaine Holmes, Johan Trygg. "OPLS discriminant analysis: combining the strengths of PLS-DA and SIMCA classification." Journal of Chemometrics 20 (8-10), 341-351 (2006). Examples -------- >>> from chemotools.datasets import load_fermentation_train >>> from chemotools.outliers import DModX >>> from sklearn.decomposition import PCA >>> # Load sample data >>> X, _ = load_fermentation_train() >>> # Instantiate the PCA model >>> pca = PCA(n_components=3).fit(X) >>> # Initialize DModX with the fitted PCA model >>> dmodx = DModX(model=pca, confidence=0.95, mean_centered=True) DModX(model=PCA(n_components=3), confidence=0.95, mean_centered=True) >>> dmodx.fit(X) >>> # Predict outliers in the dataset >>> outliers = dmodx.predict(X) >>> # Calculate DModX residuals >>> residuals = dmodx.predict_residuals(X) """ _parameter_constraints: dict = { **_ModelResidualsBase._parameter_constraints, "mean_centered": [bool], } def __init__( self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95, mean_centered: bool = True, ) -> None: super().__init__(model, confidence) self.mean_centered = mean_centered def _fit_residuals(self, X: np.ndarray, y: Optional[np.ndarray]) -> None: """Compute training residual variance and critical value.""" residuals = calculate_residual_spectrum(X, self.estimator_) # Sum of squared residuals for the training set self.train_sse_ = np.sum(residuals**2) # Set degrees of freedom adjustment for mean centering self.A0_ = 1 if self.mean_centered else 0 # Calculate degrees of freedom terms # K - A (Variables - Components) dof_vars = self.n_features_in_ - self.n_components_ # N - A - A0 (Samples - Components - Centering) dof_samples = self.n_samples_ - self.n_components_ - self.A0_ # 1. Numerator DoF: Degrees of freedom for the specific sample being tested dof_num = dof_vars # 2. Denominator DoF: Degrees of freedom for the pooled model variance # CORRECTION: We must multiply samples DoF by variable DoF dof_den = dof_samples * dof_vars # Compute the critical value using F-distribution f_quantile = f_distribution.ppf(self.confidence_, dof_num, dof_den) self.critical_value_ = np.sqrt(f_quantile) def _compute_residuals(self, X: np.ndarray, y: Optional[np.ndarray]) -> np.ndarray: """Calculate normalized DModX statistics for input data.""" residuals = calculate_residual_spectrum(X, self.estimator_) sample_sse = np.sum(residuals**2, axis=1) dof_vars = self.n_features_in_ - self.n_components_ dof_samples = self.n_samples_ - self.n_components_ - self.A0_ # Variance of the specific sample (s_i^2) sample_variance = sample_sse / dof_vars # Pooled variance of the model (s_0^2) # CORRECTION: Ensure this matches the dof_den logic above model_variance = self.train_sse_ / (dof_samples * dof_vars) # The DModX statistic is the ratio of standard deviations return np.sqrt(sample_variance / model_variance)