Source code for chemotools.outliers._dmodx

"""
The :mod:`chemotools.outliers._dmodx` module implements the Distance to Model (DModX) outlier detection algorithm.
"""

# Authors: Pau Cabaneros
# License: MIT

from typing import Optional, Union
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils._param_validation import Interval, Real
from scipy.stats import f as f_distribution


from ._base import _ModelResidualsBase, ModelTypes
from ._utils import calculate_residual_spectrum



[docs]
class DModX(_ModelResidualsBase):
    """Calculate Distance to Model (DModX) statistics.

    DModX measures the distance between an observation and the model plane
    in the X-space, useful for detecting outliers.

    Parameters
    ----------
    model : Union[ModelType, Pipeline]
        A fitted PCA/PLS model or Pipeline ending with such a model

    confidence : float, default=0.95
        Confidence level for statistical calculations (between 0 and 1)

    mean_centered : bool, default=True
        Indicates if the input data was mean-centered before modeling

    Attributes
    ----------
    estimator_ : ModelType
        The fitted model of type _BasePCA or _PLS

    transformer_ : Optional[Pipeline]
        Preprocessing steps before the model

    n_features_in_ : int
        Number of features in the input data

    n_components_ : int
        Number of components in the model

    n_samples_ : int
        Number of samples used to train the model

    critical_value_ : float
        The calculated critical value for outlier detection

    train_sse_: float
        The training sum of squared errors (SSE) for the model normalized by degrees of freedom

    A0_ : int
        Adjustment factor for degrees of freedom based on mean centering

    References
    ----------
    [1] Max Bylesjö, Mattias Rantalainen, Oliver Cloarec, Johan K. Nicholson,
        Elaine Holmes, Johan Trygg.
        "OPLS discriminant analysis: combining the strengths of PLS-DA and SIMCA
        classification." Journal of Chemometrics 20 (8-10), 341-351 (2006).

    Examples
    --------
    >>> from chemotools.datasets import load_fermentation_train
    >>> from chemotools.outliers import DModX
    >>> from sklearn.decomposition import PCA
    >>> # Load sample data
    >>> X, _ = load_fermentation_train()
    >>> # Instantiate the PCA model
    >>> pca = PCA(n_components=3).fit(X)
    >>> # Initialize DModX with the fitted PCA model
    >>> dmodx = DModX(model=pca, confidence=0.95, mean_centered=True)
    DModX(model=PCA(n_components=3), confidence=0.95, mean_centered=True)
    >>> dmodx.fit(X)
    >>> # Predict outliers in the dataset
    >>> outliers = dmodx.predict(X)
    >>> # Calculate DModX residuals
    >>> residuals = dmodx.predict_residuals(X)
    """

    _parameter_constraints: dict = {
        "model": [Pipeline, ModelTypes],
        "confidence": [Interval(Real, 0, 1, closed="both")],
        "mean_centered": [bool],
    }

    def __init__(
        self,
        model: Union[ModelTypes, Pipeline],
        confidence: float = 0.95,
        mean_centered: bool = True,
    ) -> None:
        super().__init__(model, confidence)
        self.mean_centered = mean_centered


[docs]
    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
        """
        Fit the model and compute training residual variance.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features)
            The input data used to fit the model.

        y : None
            Ignored to align with API.

        Returns
        -------
        self : DModX
            Fitted estimator with computed training residuals and critical value.
        """
        X_validated = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Process data through transformer if available
        X_processed = (
            self.transformer_.transform(X_validated)
            if self.transformer_
            else X_validated
        )

        # Calculate residuals for the training set
        residuals = calculate_residual_spectrum(X_processed, self.estimator_)

        # Sum of squared residuals for the training set
        self.train_sse_ = np.sum(residuals**2)

        # Set degrees of freedom depending on mean centering
        self.A0_ = 1 if self.mean_centered else 0

        # Compute the critical value
        self.critical_value_ = self._calculate_critical_value()

        return self



[docs]
    def predict(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
        """
        Identify outliers in the input data.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features)
            The input data to predict outliers for.

        y : None
            Ignored to align with API.

        Returns
        -------
        outliers : np.ndarray of shape (n_samples,)
            Array indicating outliers (-1) and inliers (1).
        """
        return super().predict(X, y)



[docs]
    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
    ) -> np.ndarray:
        """
        Calculate normalized DModX statistics for input data.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features)
            The input data to calculate DModX statistics for.

        y : None
            Ignored.

        validate : bool, default=True
            If True, validate the input data.

        Returns
        -------
        dmodx_values : np.ndarray of shape (n_samples,)
            The normalized DModX statistics for each sample.
        """
        # Ensure the model is fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate input data if required
        if validate:
            X = validate_data(
                self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
            )

        # Process data through transformer if available
        X_processed = self.transformer_.transform(X) if self.transformer_ else X

        # Calculate residuals for the input data
        residuals = calculate_residual_spectrum(X_processed, self.estimator_)
        sample_sse = np.sum(residuals**2, axis=1)

        # Normalize residuals per dimension
        residual_norm = np.sqrt(sample_sse / (self.n_features_in_ - self.n_components_))

        # Scale factor based on training set residuals
        training_residual_scale = np.sqrt(
            self.train_sse_
            / (
                (self.n_samples_ - self.n_components_ - self.A0_)
                * (self.n_features_in_ - self.n_components_)
            )
        )

        return residual_norm / training_residual_scale


    def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
        """Calculate F-distribution based critical value."""
        dof_num = self.n_features_in_ - self.n_components_
        dof_den = self.n_samples_ - self.n_components_ - self.A0_

        f_quantile = f_distribution.ppf(self.confidence, dof_num, dof_den)
        return np.sqrt(f_quantile)