"""
The :mod:`chemotools.projection._orthogonal_pls` module implements the Orthogonal
Projection to Latent Structures (OPLS) technique for preprocessing spectral data by
removing variations orthogonal to the target variable.
"""
# Author: Pau Cabaneros
# License: MIT
from numbers import Integral
import numpy as np
from scipy.linalg import svd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils._param_validation import Interval
from sklearn.utils.validation import check_is_fitted, validate_data
class OrthogonalPLS(TransformerMixin, BaseEstimator):
    """
    A transformer that removes variation in X that is orthogonal to the target y using
    Orthogonal Projection to Latent Structures (OPLS) [1]_.

    OPLS extends PLS by explicitly separating X into predictive variation,
    correlated with y, and orthogonal variation, unrelated to y. At each
    iteration, a predictive weight vector is estimated using the PLS criterion
    (maximizing covariance between X and y). Scores and loadings are then
    computed, and the loading vector is decomposed into a component aligned
    with the predictive weight and a component orthogonal to it. The orthogonal
    component defines an orthogonal score vector, which is used to deflate X.
    This procedure is repeated to remove multiple orthogonal components while
    retaining the predictive structure. Multivariate targets are supported via
    decomposition of the cross-covariance matrix.

    The transformer returns X with orthogonal variation removed, preserving the
    original number of features.

    Parameters
    ----------
    n_components : int, default=1
        The number of orthogonal components to compute. This determines how many
        orthogonal variations will be removed from the data.

    copy : bool, default=False
        If True, a copy of the input data is created and used for computations.
        If False, the input data is modified in place.

    Attributes
    ----------
    x_weights_ : ndarray of shape (n_features, n_components)
        The weights of the original components.

    x_weights_orth_ : ndarray of shape (n_features, n_components)
        The weights of the orthogonal components.

    x_loadings_ : ndarray of shape (n_features, n_components)
        The loadings of the original components.

    x_loadings_orth_ : ndarray of shape (n_features, n_components)
        The loadings of the orthogonal components.

    x_scores_ : ndarray of shape (n_samples, n_components)
        The scores of the original components.

    x_scores_orth_ : ndarray of shape (n_samples, n_components)
        The scores of the orthogonal components.

    mean_X_ : ndarray of shape (n_features,)
        The mean of the original data `X` used for centering.

    mean_y_ : float or ndarray of shape (n_targets,)
        The mean of the target variable `y` used for centering.

    retained_variance_ratio_ : float
        The proportion of variance in `X` that remains after removing the
        orthogonal components.

    removed_variance_ratio_ : float
        The proportion of variance in `X` removed by the orthogonal
        components.

    References
    ----------
    .. [1] Trygg, J., & Wold, S. (2002).
        Orthogonal projections to latent structures (O-PLS).
        Journal of Chemometrics, Volume 16, Issue 3, Pages 119-128,
        https://doi.org/10.1002/cem.695.

    Examples
    --------
    Fit and apply OrthogonalPLS to remove variation in `X` that is orthogonal to
    `y`.

    >>> import numpy as np
    >>> from chemotools.projection import OrthogonalPLS
    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
    >>> y = np.array([1, 2, 3])
    >>> opls = OrthogonalPLS(n_components=1)
    >>> opls.fit(X, y)
    OrthogonalPLS()
    >>> X_transformed = opls.transform(X, y)
    """

    # Declarative constraints consumed by sklearn's _validate_params().
    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "copy": ["boolean"],
    }

    def __init__(self, n_components: int = 1, copy: bool = False):
        """Initialize the OrthogonalPLS transformer.

        Parameters
        ----------
        n_components : int, default=1
            The number of orthogonal components to compute. This determines how many
            orthogonal variations will be removed from the data.

        copy : bool, default=False
            If True, a copy of the input data is created and used for computations.
            If False, the input data is modified in place.
        """
        # Store hyperparameters verbatim; sklearn requires __init__ to be
        # side-effect free so that get_params/set_params round-trip.
        self.n_components = n_components
        self.copy = copy

    def fit(self, X: np.ndarray, y: np.ndarray) -> "OrthogonalPLS":
        """Fit the OrthogonalPLS model to the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to fit the model to.

        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            The target values.

        Returns
        -------
        self : OrthogonalPLS
            Fitted estimator.

        Raises
        ------
        ValueError
            If fewer than 2 samples are given, if ``n_components`` exceeds the
            effective rank of the centered ``X``, if ``X`` has zero variance
            after centering, or if no orthogonal variation remains for a
            requested component.
        """
        # Validate hyperparameters and check that X is a finite 2D float array.
        self._validate_params()
        X, y = validate_data(
            self,
            X,
            y=y,
            ensure_2d=True,
            reset=True,
            copy=self.copy,
            dtype=np.float64,
            multi_output=True,
        )
        # validate_data casts X but not necessarily y; enforce float64 for y.
        y = np.asarray(y, dtype=np.float64)

        # At least 2 samples are needed for a meaningful cross-covariance.
        n_samples, n_features = X.shape
        if n_samples < 2:
            raise ValueError(
                "n_samples=1 is not enough for OrthogonalPLS (OPLS). "
                "At least 2 samples are required."
            )

        # Validate that n_components does not exceed the rank of X after
        # centering. Centering reduces the effective rank in the sample
        # direction by 1, so the maximum number of meaningful components is
        # min(n_samples - 1, n_features).
        max_components = min(n_samples - 1, n_features)
        if self.n_components > max_components:
            raise ValueError(
                f"n_components={self.n_components} is too large. "
                f"After mean-centering, the effective rank of X is at most "
                f"min(n_samples - 1, n_features) = {max_components}. "
                f"Set n_components to a value <= {max_components}."
            )

        # Center X and y; the means are stored for reuse at transform time.
        self.mean_X_ = np.mean(X, axis=0)
        self.mean_y_ = np.mean(y, axis=0) if y.ndim == 2 else np.mean(y)
        Xk = X - self.mean_X_
        yk = y - self.mean_y_
        # Work with y as a 2D column matrix so single- and multi-target cases
        # share the same SVD-based code path.
        yk = yk.reshape(-1, 1) if yk.ndim == 1 else yk

        # Pre-allocate predictive (w, p, t) and orthogonal (w_ortho, p_ortho,
        # t_ortho) weights, loadings and scores following the notation in [1].
        self.x_weights_ = np.zeros((n_features, self.n_components))  # w
        self.x_weights_orth_ = np.zeros((n_features, self.n_components))  # w_ortho
        self.x_loadings_ = np.zeros((n_features, self.n_components))  # p
        self.x_loadings_orth_ = np.zeros((n_features, self.n_components))  # p_ortho
        self.x_scores_ = np.zeros((n_samples, self.n_components))  # t
        self.x_scores_orth_ = np.zeros((n_samples, self.n_components))  # t_ortho

        # Total sum of squares of centered X: denominator of variance ratios.
        total_ss_x = np.sum(Xk**2)
        if total_ss_x == 0:
            raise ValueError(
                "X has zero variance after mean-centering. OrthogonalPLS requires "
                "X to contain at least some non-constant features."
            )

        for k in range(self.n_components):
            # Step 1 in [1]: predictive weights from the cross-covariance
            # C = X'Y. The SVD is used (instead of C/||C||) so that the same
            # code supports multivariate y; only the leading left singular
            # vector is needed.
            C = np.dot(Xk.T, yk)
            U, _, _ = svd(C, full_matrices=False)
            x_weights = U[:, 0]

            # Step 2 in [1]: normalize the predictive weights.
            x_weights /= np.linalg.norm(x_weights)

            # Step 3 in [1]: predictive scores t = X w / (w'w).
            x_scores = np.dot(Xk, x_weights) / np.dot(x_weights.T, x_weights)

            # Step 6 in [1]: predictive loadings p = X't / (t't).
            x_loadings = np.dot(x_scores.T, Xk) / np.dot(x_scores.T, x_scores)
            x_loadings = x_loadings.T

            # Step 7 in [1]: orthogonal weights are the part of p that is
            # orthogonal to w (p minus its projection onto w).
            x_weights_orth = (
                x_loadings
                - (np.dot(x_weights.T, x_loadings) / np.dot(x_weights.T, x_weights))
                * x_weights
            )

            # Step 8 in [1]: normalize the orthogonal weights. A zero norm
            # means X holds no variation orthogonal to y for this component;
            # fail loudly instead of letting the division produce NaNs.
            orth_norm = np.linalg.norm(x_weights_orth)
            if orth_norm == 0:
                raise ValueError(
                    f"No orthogonal variation left in X for component {k + 1}. "
                    f"Set n_components to a value <= {k}."
                )
            x_weights_orth /= orth_norm

            # Step 9 in [1]: orthogonal scores t_ortho = X w_ortho / (w_ortho'w_ortho).
            x_scores_orth = np.dot(Xk, x_weights_orth) / np.dot(
                x_weights_orth.T, x_weights_orth
            )

            # Step 10 in [1]: orthogonal loadings p_ortho = X't_ortho / (t_ortho't_ortho).
            x_loadings_orth = np.dot(x_scores_orth.T, Xk) / np.dot(
                x_scores_orth.T, x_scores_orth
            )

            # Step 11 in [1]: deflate X by removing the orthogonal component.
            Xk -= np.outer(x_scores_orth, x_loadings_orth)

            # Collect the vectors of this component.
            self.x_weights_[:, k] = x_weights
            self.x_weights_orth_[:, k] = x_weights_orth
            self.x_loadings_[:, k] = x_loadings
            self.x_loadings_orth_[:, k] = x_loadings_orth
            self.x_scores_[:, k] = x_scores
            self.x_scores_orth_[:, k] = x_scores_orth

        # Variance bookkeeping: the fraction of the original sum of squares
        # still present in the deflated X is retained; the remainder was
        # removed as orthogonal variation. Computed once after the loop — the
        # per-iteration value is only ever observed for the final component.
        self.retained_variance_ratio_ = np.sum(Xk**2) / total_ss_x
        self.removed_variance_ratio_ = 1 - self.retained_variance_ratio_
        return self