Source code for chemotools.outliers._leverage

"""
The :mod:`chemotools.outliers._leverage` module implements the Leverage
outlier detection algorithm.
"""

# Authors: Pau Cabaneros
# License: MIT

from typing import Optional, Union
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils._param_validation import Interval, Real


from ._base import _ModelResidualsBase, ModelTypes



[docs]
class Leverage(_ModelResidualsBase):
    """
    Calculate the leverage of the training samples on  the latent space of a PLS model.
    This method allows to detect datapoints with high leverage in the model.

    Parameters
    ----------
    model : Union[ModelType, Pipeline]
        A fitted PLSRegression model or Pipeline ending with such a model

    confidence : float, default=0.95
        Confidence level for statistical calculations (between 0 and 1)

    Attributes
    ----------
    estimator_ : ModelType
        The fitted model of type _PLS

    transformer_ : Optional[Pipeline]
        Preprocessing steps before the model

    n_features_in_ : int
        Number of features in the input data

    n_components_ : int
        Number of components in the model

    n_samples_ : int
        Number of samples used to train the model

    critical_value_ : float
        The calculated critical value for outlier detection

    References
    ----------
    [1] Kim H. Esbensen,
        "Multivariate Data Analysis - In Practice", 5th Edition, 2002.

    Examples
    --------
    >>> from sklearn.cross_decomposition import PLSRegression
    >>> from chemotools.outliers import Leverage
    >>> X = np.random.rand(100, 10)
    >>> y = np.random.rand(100)
    >>> pls = PLSRegression(n_components=3).fit(X, y)
    >>> # Initialize Leverage with the fitted PLS model
    >>> leverage = Leverage(pls, confidence=0.95)
    Leverage(model=PLSRegression(n_components=3), confidence=0.95)
    >>> leverage.fit(X, y)
    >>> # Predict outliers in the dataset
    >>> outliers = leverage.predict(X)
    >>> # Get the leverage of the samples
    >>> residuals = leverage.predict_residuals(X)
    """

    _parameter_constraints: dict = {
        "model": [Pipeline, ModelTypes],
        "confidence": [Interval(Real, 0, 1, closed="both")],
    }

    def __init__(
        self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
    ) -> None:
        model, confidence = model, confidence
        super().__init__(model, confidence)


[docs]
    def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "Leverage":
        """
        Fit the model to the input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        y : array-like of shape (n_samples,), default=None
            Target data

        Returns
        -------
        self : Leverage
            Fitted estimator with the critical threshold computed
        """
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        if self.transformer_:
            X = self.transformer_.fit_transform(X)

        # Compute the critical threshold
        self.critical_value_ = self._calculate_critical_value(X)

        return self



[docs]
    def predict(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
        """Calculate Leverage for training data on the model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        Returns
        -------
        ndarray of shape (n_samples,)
            Bool with samples with a leverage above the critical value
        """
        return super().predict(X, y)



[docs]
    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
    ) -> np.ndarray:
        """Calculate the leverage of the samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        Returns
        -------
        np.ndarray
            Leverage of the samples
        """
        # Check the estimator has been fitted
        check_is_fitted(self, ["critical_value_"])

        # Validate the input data
        if validate:
            X = validate_data(self, X, ensure_2d=True, dtype=np.float64)

        # Apply preprocessing if available
        if self.transformer_:
            X = self.transformer_.transform(X)

        # Calculate the leverage
        return calculate_leverage(X, self.estimator_)


    def _calculate_critical_value(self, X: np.ndarray) -> float:
        """Calculate the critical value for outlier detection using the percentile outlier method."""

        # Calculate the leverage of the samples
        leverage = calculate_leverage(X, self.estimator_)

        # Calculate the critical value
        return np.percentile(leverage, self.confidence * 100)



def calculate_leverage(X: np.ndarray, model: ModelTypes) -> np.ndarray:
    """
    Calculate the leverage of the training samples in a PLS/PCA-like model.

    Parameters
    ----------
    model : Union[_BasePCA, _PLS]
        A fitted PCA/PLS model

    X : np.ndarray
        Preprocessed input data

    Returns
    -------
    np.ndarray
        Leverage of the samples
    """

    X_transformed = model.transform(X)

    X_hat = (
        X_transformed @ np.linalg.inv(X_transformed.T @ X_transformed) @ X_transformed.T
    )

    return np.diag(X_hat)