# Source code for chemotools.projection._direct_orthogonalization

"""
The :mod:`chemotools.projection._direct_orthogonalization` module implements the Direct
Orthogonalization (DO) technique for preprocessing spectral data by removing variations
orthogonal to the target variable.
"""

# Author: Pau Cabaneros
# License: MIT

from numbers import Integral

import numpy as np
from scipy.linalg import svd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils._param_validation import Interval
from sklearn.utils.validation import check_is_fitted, validate_data


class DirectOrthogonalization(TransformerMixin, BaseEstimator):
    """Remove variation in X that is uncorrelated with the target y using
    Direct Orthogonalization (DO) [1]_ [2]_.

    DO removes from X systematic variation that is independent of y. X is
    orthogonalized with respect to y, PCA is performed on the orthogonalized
    matrix to estimate orthogonal components, and those components are
    subtracted from X to obtain the corrected data. The transformer returns
    the corrected matrix with the same number of features, retaining
    variation relevant for predicting y. Inputs are typically assumed to be
    mean-centered.

    Parameters
    ----------
    n_components : int, default=1
        The number of orthogonal components to compute. This determines how
        many orthogonal variations will be removed from the data.

    copy : bool, default=False
        If True, a copy of the input data is created and used for
        computations. If False, the input data is modified in place.

    Attributes
    ----------
    x_weights_orth_ : ndarray of shape (n_features, n_components)
        The weights of the orthogonal components.

    x_loadings_orth_ : ndarray of shape (n_features, n_components)
        The loadings of the orthogonal components.

    x_scores_orth_ : ndarray of shape (n_samples, n_components)
        The scores of the orthogonal components.

    mean_X_ : ndarray of shape (n_features,)
        The mean of the original data `X` used for centering.

    mean_y_ : float or ndarray of shape (n_targets,)
        The mean of the target variable `y` used for centering.

    retained_variance_ratio_ : float
        The proportion of variance in `X` retained explained by the
        predictive components.

    removed_variance_ratio_ : float
        The proportion of variance in `X` removed explained by the
        orthogonal components.

    References
    ----------
    .. [1] Claus A. Andersson, (1999). Direct orthogonalization.
        Chemometrics and Intelligent Laboratory Systems, Volume 47, Issue 1,
        Pages 51-63, https://doi.org/10.1016/S0169-7439(98)00158-0.

    .. [2] O. Svensson, T. Kourti, J. F. MacGregor (2002). An investigation
        of orthogonal signal correction algorithms and their characteristics.
        Journal of Chemometrics, Volume 16, Issue 4, Pages 176-188,
        https://doi.org/10.1002/cem.700

    Examples
    --------
    Fit and apply DirectOrthogonalization to remove variation in `X` that is
    orthogonal to `y`.

    >>> import numpy as np
    >>> from chemotools.projection import DirectOrthogonalization
    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
    >>> y = np.array([1, 2, 3])
    >>> do = DirectOrthogonalization(n_components=1)
    >>> do.fit(X, y)
    DirectOrthogonalization()
    >>> X_transformed = do.transform(X, y)
    """

    # Constraints consumed by sklearn's _validate_params() in fit().
    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "copy": ["boolean"],
    }

    def __init__(self, n_components: int = 1, copy: bool = False):
        """Initialize the DirectOrthogonalization transformer.

        Parameters
        ----------
        n_components : int, default=1
            The number of orthogonal components to compute. This determines
            how many orthogonal variations will be removed from the data.

        copy : bool, default=False
            If True, a copy of the input data is created and used for
            computations. If False, the input data is modified in place.
        """
        # Per sklearn convention, __init__ only stores the hyperparameters;
        # all validation happens in fit().
        self.n_components = n_components
        self.copy = copy
[docs] def fit(self, X: np.ndarray, y: np.ndarray) -> "DirectOrthogonalization": """Fit the DirectOrthogonalization model to the training data. Parameters ---------- X : array-like of shape (n_samples, n_features) The input data to fit the model to. y : array-like of shape (n_samples,) The target values. Returns ------- self : DirectOrthogonalization Fitted estimator. """ # Check that X is a 2D array and has only finite values self._validate_params() X, y = validate_data( self, X, y=y, ensure_2d=True, reset=True, copy=self.copy, dtype=np.float64, multi_output=True, ) y = np.asarray(y, dtype=np.float64) # Validate that there are at least 2 samples n_samples = X.shape[0] if n_samples < 2: raise ValueError( "n_samples=1 is not enough for direct orthogonalization. " "At least 2 samples are required." ) # Center the data self.mean_X_ = np.mean(X, axis=0) self.mean_y_ = np.mean(y, axis=0) if y.ndim == 2 else np.mean(y) X_centered = X - self.mean_X_ y_centered = y - self.mean_y_ yk = y_centered.reshape(-1, 1) if y_centered.ndim == 1 else y_centered # Calculate total sum of squares for X total_ss_x = np.sum(X_centered**2) if total_ss_x == 0: raise ValueError( "X has zero variance after mean-centering. OrthogonalPLS requires " "X to contain at least some non-constant features." ) # Step 1: Orthogonalize X with respect to y using regression (Step 2 in [1]) # Generalization to multivariate y. coef_yx is w in [1]. # This is equivalent to: # coef_yx = np.linalg.pinv(yk.T @ yk) @ yk.T @ X_centered, # but using lstsq is more numerically stable and handles rank-deficient cases. 
coef_yx, _, _, _ = np.linalg.lstsq(yk, X_centered, rcond=None) # Step 2: Deflate X by removing the variation explained by y (Step 2 in [1]) Xk = X_centered - yk @ coef_yx # Step 3: SVD of the deflated Xk to get orthogonal components (Step 3 in [1]) _, _, Vt = svd(Xk, full_matrices=False) self.x_loadings_orth_ = Vt[: self.n_components].T # Check that n_components is not greater than the maximum possible components max_components = Vt.shape[0] if self.n_components > max_components: raise ValueError( "Number of components must be less than or equal to the number " "of features." ) # Step 4: Calculate orthogonal scores (Step 4.1 in [1]) self.x_scores_orth_ = np.dot(X_centered, self.x_loadings_orth_) # Step 5: Orthogonal weights are the same as loadings (Table 1 in [2]) self.x_weights_orth_ = self.x_loadings_orth_ # Step 6: Deflate X globally (vectorized version of Step 4.2 in [1]) X_deflated = X_centered - np.dot(self.x_scores_orth_, self.x_loadings_orth_.T) # Step 7: Calculate sum of squares in defleated Xk total_ss_x_k = np.sum(X_deflated**2) # Step 8: Calculate variance ratio in prediction matrix self.retained_variance_ratio_ = total_ss_x_k / total_ss_x self.removed_variance_ratio_ = 1 - self.retained_variance_ratio_ return self
[docs] def transform(self, X: np.ndarray, y=None) -> np.ndarray: """Apply the Direct Orthogonalization (D) correction to X This returns the predictive part of the data, i.e. the variation in X that is related to y, after removing the orthogonal part (variation in X that is not related to y). Parameters ---------- X : array-like of shape (n_samples, n_features) The input data to transform. y : None Ignored to align with API. Returns ------- X_transformed : array-like of shape (n_samples, n_features) The transformed data. """ # Check that the estimator is fitted check_is_fitted(self, "x_weights_orth_") # Validate input data X = validate_data( self, X, y="no_validation", ensure_2d=True, reset=False, copy=self.copy, dtype=np.float64, ) # Mean center the new data Xc = X - self.mean_X_ # Transform the data t_ortho = np.dot(Xc, self.x_weights_orth_) Xc -= np.dot(t_ortho, self.x_loadings_orth_.T) return Xc + self.mean_X_