"""
The :mod:`chemotools.outliers._leverage` module implements the Leverage
outlier detection algorithm.
"""
# Authors: Pau Cabaneros
# License: MIT
from typing import Optional, Union
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils._param_validation import Interval, Real
from ._base import _ModelResidualsBase, ModelTypes
[docs]
class Leverage(_ModelResidualsBase):
"""
Calculate the leverage of the training samples on the latent space of a PLS model.
This method allows to detect datapoints with high leverage in the model.
Parameters
----------
model : Union[ModelType, Pipeline]
A fitted PLSRegression model or Pipeline ending with such a model
confidence : float, default=0.95
Confidence level for statistical calculations (between 0 and 1)
Attributes
----------
estimator_ : ModelType
The fitted model of type _PLS
transformer_ : Optional[Pipeline]
Preprocessing steps before the model
n_features_in_ : int
Number of features in the input data
n_components_ : int
Number of components in the model
n_samples_ : int
Number of samples used to train the model
critical_value_ : float
The calculated critical value for outlier detection
References
----------
[1] Kim H. Esbensen,
"Multivariate Data Analysis - In Practice", 5th Edition, 2002.
Examples
--------
>>> from sklearn.cross_decomposition import PLSRegression
>>> from chemotools.outliers import Leverage
>>> X = np.random.rand(100, 10)
>>> y = np.random.rand(100)
>>> pls = PLSRegression(n_components=3).fit(X, y)
>>> # Initialize Leverage with the fitted PLS model
>>> leverage = Leverage(pls, confidence=0.95)
Leverage(model=PLSRegression(n_components=3), confidence=0.95)
>>> leverage.fit(X, y)
>>> # Predict outliers in the dataset
>>> outliers = leverage.predict(X)
>>> # Get the leverage of the samples
>>> residuals = leverage.predict_residuals(X)
"""
_parameter_constraints: dict = {
"model": [Pipeline, ModelTypes],
"confidence": [Interval(Real, 0, 1, closed="both")],
}
def __init__(
self, model: Union[ModelTypes, Pipeline], confidence: float = 0.95
) -> None:
model, confidence = model, confidence
super().__init__(model, confidence)
[docs]
def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "Leverage":
"""
Fit the model to the input data.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Input data
y : array-like of shape (n_samples,), default=None
Target data
Returns
-------
self : Leverage
Fitted estimator with the critical threshold computed
"""
X = validate_data(
self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
)
if self.transformer_:
X = self.transformer_.fit_transform(X)
# Compute the critical threshold
self.critical_value_ = self._calculate_critical_value(X)
return self
[docs]
def predict(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
"""Calculate Leverage for training data on the model.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Input data
Returns
-------
ndarray of shape (n_samples,)
Bool with samples with a leverage above the critical value
"""
return super().predict(X, y)
[docs]
def predict_residuals(
self, X: np.ndarray, y: Optional[np.ndarray] = None, validate: bool = True
) -> np.ndarray:
"""Calculate the leverage of the samples.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Input data
Returns
-------
np.ndarray
Leverage of the samples
"""
# Check the estimator has been fitted
check_is_fitted(self, ["critical_value_"])
# Validate the input data
if validate:
X = validate_data(self, X, ensure_2d=True, dtype=np.float64)
# Apply preprocessing if available
if self.transformer_:
X = self.transformer_.transform(X)
# Calculate the leverage
return calculate_leverage(X, self.estimator_)
def _calculate_critical_value(self, X: np.ndarray) -> float:
"""Calculate the critical value for outlier detection using the percentile outlier method."""
# Calculate the leverage of the samples
leverage = calculate_leverage(X, self.estimator_)
# Calculate the critical value
return np.percentile(leverage, self.confidence * 100)
def calculate_leverage(X: np.ndarray, model: ModelTypes) -> np.ndarray:
"""
Calculate the leverage of the training samples in a PLS/PCA-like model.
Parameters
----------
model : Union[_BasePCA, _PLS]
A fitted PCA/PLS model
X : np.ndarray
Preprocessed input data
Returns
-------
np.ndarray
Leverage of the samples
"""
X_transformed = model.transform(X)
X_hat = (
X_transformed @ np.linalg.inv(X_transformed.T @ X_transformed) @ X_transformed.T
)
return np.diag(X_hat)