Source code for chemotools.projection._orthogonal_pls

"""
The :mod:`chemotools.projection._orthogonal_pls` module implements the Orthogonal
Projection to Latent Structures (OPLS) technique for preprocessing spectral data by
removing variations orthogonal to the target variable.
"""

# Author: Pau Cabaneros
# License: MIT

from numbers import Integral

import numpy as np
from scipy.linalg import svd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils._param_validation import Interval
from sklearn.utils.validation import check_is_fitted, validate_data


class OrthogonalPLS(TransformerMixin, BaseEstimator):
    """Remove variation in ``X`` that is orthogonal to the target ``y`` using
    Orthogonal Projection to Latent Structures (OPLS) [1]_.

    OPLS extends PLS by explicitly separating ``X`` into predictive variation,
    correlated with ``y``, and orthogonal variation, unrelated to ``y``. At
    each iteration, a predictive weight vector is estimated using the PLS
    criterion (maximizing covariance between ``X`` and ``y``). Scores and
    loadings are then computed, and the loading vector is decomposed into a
    component aligned with the predictive weight and a component orthogonal to
    it. The orthogonal component defines an orthogonal score vector, which is
    used to deflate ``X``. This procedure is repeated to remove multiple
    orthogonal components while retaining the predictive structure.
    Multivariate targets are supported via SVD of the cross-covariance matrix.

    The transformer returns ``X`` with orthogonal variation removed,
    preserving the original number of features.

    Parameters
    ----------
    n_components : int, default=1
        The number of orthogonal components to compute. This determines how
        many orthogonal variations will be removed from the data.

    copy : bool, default=False
        If True, a copy of the input data is created and used for
        computations. If False, the input data may be modified in place.

    Attributes
    ----------
    x_weights_ : ndarray of shape (n_features, n_components)
        The weights of the predictive components.

    x_weights_orth_ : ndarray of shape (n_features, n_components)
        The weights of the orthogonal components.

    x_loadings_ : ndarray of shape (n_features, n_components)
        The loadings of the predictive components.

    x_loadings_orth_ : ndarray of shape (n_features, n_components)
        The loadings of the orthogonal components.

    x_scores_ : ndarray of shape (n_samples, n_components)
        The scores of the predictive components.

    x_scores_orth_ : ndarray of shape (n_samples, n_components)
        The scores of the orthogonal components.

    mean_X_ : ndarray of shape (n_features,)
        The mean of the training data ``X`` used for centering.

    mean_y_ : float or ndarray of shape (n_targets,)
        The mean of the target variable ``y`` used for centering.

    retained_variance_ratio_ : float
        The proportion of variance in ``X`` retained by the predictive
        components after deflation.

    removed_variance_ratio_ : float
        The proportion of variance in ``X`` removed by the orthogonal
        components.

    References
    ----------
    .. [1] Trygg, J., & Wold, S. (2002). Orthogonal projections to latent
        structures (O-PLS). Journal of Chemometrics, Volume 16, Issue 3,
        Pages 119-128, https://doi.org/10.1002/cem.695.

    Examples
    --------
    Fit and apply OrthogonalPLS to remove variation in ``X`` that is
    orthogonal to ``y``.

    >>> import numpy as np
    >>> from chemotools.projection import OrthogonalPLS
    >>> X = np.array([[1.0, 2.0], [3.0, 3.0], [5.0, 7.0]])
    >>> y = np.array([1.0, 2.0, 3.0])
    >>> opls = OrthogonalPLS(n_components=1)
    >>> _ = opls.fit(X, y)
    >>> X_transformed = opls.transform(X)
    """

    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "copy": ["boolean"],
    }

    def __init__(self, n_components: int = 1, copy: bool = False):
        """Initialize the OrthogonalPLS transformer.

        Parameters
        ----------
        n_components : int, default=1
            The number of orthogonal components to compute. This determines
            how many orthogonal variations will be removed from the data.

        copy : bool, default=False
            If True, a copy of the input data is created and used for
            computations. If False, the input data may be modified in place.
        """
        self.n_components = n_components
        self.copy = copy

    def fit(self, X: np.ndarray, y: np.ndarray) -> "OrthogonalPLS":
        """Fit the OrthogonalPLS model to the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to fit the model to.

        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            The target values.

        Returns
        -------
        self : OrthogonalPLS
            Fitted estimator.

        Raises
        ------
        ValueError
            If fewer than 2 samples are provided, if ``n_components`` exceeds
            the effective rank of the centered data, if ``X`` has zero
            variance after centering, or if no orthogonal variation remains
            for a requested component.
        """
        # Check that X is a 2D array and has only finite values
        self._validate_params()
        X, y = validate_data(
            self,
            X,
            y=y,
            ensure_2d=True,
            reset=True,
            copy=self.copy,
            dtype=np.float64,
            multi_output=True,
        )
        y = np.asarray(y, dtype=np.float64)

        # Validate that there are at least 2 samples
        n_samples, n_features = X.shape
        if n_samples < 2:
            raise ValueError(
                "n_samples=1 is not enough for OrthogonalPLS (OPLS). "
                "At least 2 samples are required."
            )

        # Validate that n_components does not exceed the rank of X after
        # centering. Centering reduces the effective rank in the sample
        # direction by 1, so the maximum number of meaningful components is
        # min(n_samples - 1, n_features).
        max_components = min(n_samples - 1, n_features)
        if self.n_components > max_components:
            raise ValueError(
                f"n_components={self.n_components} is too large. "
                f"After mean-centering, the effective rank of X is at most "
                f"min(n_samples - 1, n_features) = {max_components}. "
                f"Set n_components to a value <= {max_components}."
            )

        # Center the data
        self.mean_X_ = np.mean(X, axis=0)
        self.mean_y_ = np.mean(y, axis=0) if y.ndim == 2 else np.mean(y)
        Xk = X - self.mean_X_
        yk = y - self.mean_y_

        # Get the dimensions; the algorithm works on a 2D y throughout
        n = n_samples
        p = n_features
        yk = yk.reshape(-1, 1) if yk.ndim == 1 else yk

        # Allocate scores and weights
        self.x_weights_ = np.zeros((p, self.n_components))  # w in [1]
        self.x_weights_orth_ = np.zeros((p, self.n_components))  # w_ortho in [1]
        self.x_loadings_ = np.zeros((p, self.n_components))  # p in [1]
        self.x_loadings_orth_ = np.zeros((p, self.n_components))  # p_ortho in [1]
        self.x_scores_ = np.zeros((n, self.n_components))  # t in [1]
        self.x_scores_orth_ = np.zeros((n, self.n_components))  # t_ortho in [1]

        # Calculate total sum of squares for X
        total_ss_x = np.sum(Xk**2)
        if total_ss_x == 0:
            raise ValueError(
                "X has zero variance after mean-centering. OrthogonalPLS "
                "requires X to contain at least some non-constant features."
            )

        # For each component
        for k in range(self.n_components):
            # Step 1: Weights are calculated through SVD to support multi y
            # (Step 1 in [1])
            # Step 1.1: Calculate covariance matrix (C)
            C = np.dot(Xk.T, yk)

            # Step 1.2: Calculate the reduced SVD of C since only the leading
            # left singular vector is used
            U, _, _ = svd(C, full_matrices=False)

            # Step 1.3: We just use the first weight
            x_weights = U[:, 0]

            # Step 2: Normalize the weights (Step 2 in [1])
            x_weights /= np.linalg.norm(x_weights)

            # Step 3: Calculate the x_scores (Step 3 in [1])
            x_scores = np.dot(Xk, x_weights) / np.dot(x_weights.T, x_weights)

            # Step 4: Calculate the x_loadings (Step 6 in [1])
            x_loadings = np.dot(x_scores.T, Xk) / np.dot(x_scores.T, x_scores)
            x_loadings = x_loadings.T

            # Step 5: Calculate orthogonal x weights (Step 7 in [1])
            x_weights_orth = (
                x_loadings
                - (np.dot(x_weights.T, x_loadings) / np.dot(x_weights.T, x_weights))
                * x_weights
            )

            # Step 6: Normalize the orthogonal weights (Step 8 in [1]).
            # If the loading vector is (numerically) parallel to the
            # predictive weights, there is no orthogonal variation left and
            # dividing by the zero norm would silently fill every fitted
            # attribute with NaN — fail loudly instead.
            orth_norm = np.linalg.norm(x_weights_orth)
            tol = np.finfo(np.float64).eps * max(
                1.0, float(np.linalg.norm(x_loadings))
            )
            if orth_norm <= tol:
                raise ValueError(
                    f"No orthogonal variation left in X for component {k + 1}: "
                    "the loading vector is parallel to the predictive weights. "
                    "Reduce n_components."
                )
            x_weights_orth /= orth_norm

            # Step 7: Calculate orthogonal x scores (Step 9 in [1])
            x_scores_orth = np.dot(Xk, x_weights_orth) / np.dot(
                x_weights_orth.T, x_weights_orth
            )

            # Step 8: Calculate orthogonal x loadings (Step 10 in [1])
            x_loadings_orth = np.dot(x_scores_orth.T, Xk) / np.dot(
                x_scores_orth.T, x_scores_orth
            )

            # Step 9: Deflation of X matrix (Step 11 in [1])
            Xk -= np.outer(x_scores_orth, x_loadings_orth)

            # Step 10: Collect the variables
            self.x_weights_[:, k] = x_weights
            self.x_weights_orth_[:, k] = x_weights_orth
            self.x_loadings_[:, k] = x_loadings
            self.x_loadings_orth_[:, k] = x_loadings_orth
            self.x_scores_[:, k] = x_scores
            self.x_scores_orth_[:, k] = x_scores_orth

        # Step 11: Calculate sum of squares in deflated Xk
        total_ss_x_k = np.sum(Xk**2)

        # Step 12: Calculate variance ratio in prediction matrix
        self.retained_variance_ratio_ = total_ss_x_k / total_ss_x
        self.removed_variance_ratio_ = 1 - self.retained_variance_ratio_
        return self

    def transform(self, X: np.ndarray, y=None) -> np.ndarray:
        """Apply the OrthogonalPLS correction to X.

        This returns the predictive part of the data, i.e. the variation in X
        that is related to y, after removing the orthogonal part (variation in
        X that is not related to y).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to transform.

        y : None
            Ignored to align with the scikit-learn API.

        Returns
        -------
        X_transformed : array-like of shape (n_samples, n_features)
            The transformed data.
        """
        # Check that the estimator is fitted
        check_is_fitted(self, "x_weights_orth_")

        # Validate input data
        X = validate_data(
            self,
            X,
            y="no_validation",
            ensure_2d=True,
            reset=False,
            copy=self.copy,
            dtype=np.float64,
        )

        # Mean center the new data (this creates a fresh array, so the
        # in-place deflation below never touches the caller's X)
        Xc = X - self.mean_X_

        # Remove each learned orthogonal component from the new data
        for k in range(self.n_components):
            # Calculate scores for the NEW data using learned weights
            scores_orth = np.dot(Xc, self.x_weights_orth_[:, k])

            # Subtract the learned loading pattern
            Xc -= np.outer(scores_orth, self.x_loadings_orth_[:, k])

        # Re-add the training mean so the output lives on the original scale
        return Xc + self.mean_X_