chemotools.adaptation._direct_standardization 源代码

"""
The :mod:`chemotools.adaptation._direct_standardization`
module implements the Direct Standardization (DS) transformer
"""

# Authors: Ruggero Guerrini
# License: MIT

import warnings

import numpy as np
from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin
from sklearn.utils.validation import (
    check_is_fitted,
    validate_data,
)

from chemotools._doc_mixin import DocLinkMixin


class DirectStandardization(
    DocLinkMixin, OneToOneFeatureMixin, TransformerMixin, BaseEstimator
):
    """
    Direct Standardization (DS) is a transformer used for domain adaptation
    (calibration transfer) applications.

    The transformer uses least squares to find a linear map from the target
    instrument space to the source instrument space, following the
    implementation by [1]_. The map is never materialised as a full
    ``(n_features, n_features)`` matrix; it is stored as the two thin factors
    ``V_ds_`` and ``B_ds_`` described below.

    Parameters
    ----------
    None
        The transformer has no constructor hyperparameters.

    Attributes
    ----------
    V_ds_ : np.ndarray of shape (n_features, r)
        Right singular vectors of the target transfer data X, used as the
        column basis of the low-rank transformation. Here
        ``r = min(n_transfer_samples, n_features)`` (thin SVD). Empty
        (shape ``(n_features, 0)``) when no ``X_source`` was provided.

    B_ds_ : np.ndarray of shape (r, n_features)
        Projection of the source transfer data onto the singular basis,
        scaled by the inverted singular values (singular values below the
        numerical cutoff are treated as zero). The full transformation is
        ``X @ V_ds_ @ B_ds_``, which is algebraically equivalent to
        ``X @ (pinv(X_target) @ X_source)`` but requires only
        ``O(n_features * r)`` storage instead of ``O(n_features ** 2)``.
        Empty (shape ``(0, n_features)``) when no ``X_source`` was provided.

    x_source_provided_ : bool
        Boolean value to flag if X_source was provided during fitting.

    Raises
    ------
    ValueError
        If X and X_source do not have the same shape.

    See Also
    --------
    PiecewiseDirectStandardization : Localized version using windowed PLS
        regression.

    References
    ----------
    .. [1] Wang, Yongdong., Veltkamp, D. J., & Kowalski, B. R. (1991),
           Multivariate instrument standardization,
           Analytical Chemistry, 63(23), Pages 2750-2756,
           https://doi.org/10.1021/ac00023a016.

    Examples
    --------
    **Basic usage**

    >>> import numpy as np
    >>> from chemotools.adaptation import DirectStandardization
    >>>
    >>> rng = np.random.default_rng(17)
    >>> X_source = rng.normal(size=(100, 20))
    >>> X_target = X_source * 2 - rng.normal(size=(100, 20)) * 0.02
    >>>
    >>> ds = DirectStandardization().fit(X_target, X_source=X_source)
    >>> X_transf = ds.transform(X_target)
    """

    # Fitted attributes (set during fit, typed for type checkers)
    n_features_in_: int
    V_ds_: np.ndarray
    B_ds_: np.ndarray
    x_source_provided_: bool

    # No constructor hyperparameters, hence nothing to constrain.
    _parameter_constraints: dict = {}

    def fit(
        self, X: np.ndarray, y=None, *, X_source: np.ndarray | None = None
    ) -> "DirectStandardization":
        """
        Fit the Direct Standardization model.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features)
            Data from the target instrument.

        y : None
            Ignored to align with API.

        X_source : np.ndarray of shape (n_samples, n_features), optional
            Data from the source instrument, measured on the same samples as
            ``X``. If None, the transformer defaults to an identity
            transformation.

        Returns
        -------
        self : DirectStandardization
            The fitted transformer.

        Raises
        ------
        ValueError
            If X and X_source do not have the same shape.

        Warns
        -----
        UserWarning
            If ``X_source`` is None, because the fitted transformer will then
            return its input unchanged.
        """
        # Validate the input parameters
        self._validate_params()

        # Check that X is a 2D array and has only finite values
        X = validate_data(self, X, ensure_2d=True, reset=True, dtype=np.float64)

        # If X_source is None, default to identity transformation
        if X_source is None:
            # stacklevel=2 attributes the warning to the caller of fit()
            # rather than to this module.
            warnings.warn(
                "X_source is None, the transformer will act as an identity "
                "transformation.",
                UserWarning,
                stacklevel=2,
            )
            n_features = X.shape[1]
            # Empty factors keep the fitted-attribute contract intact so that
            # check_is_fitted succeeds; transform short-circuits on the flag.
            self.V_ds_ = np.empty((n_features, 0))
            self.B_ds_ = np.empty((0, n_features))
            self.x_source_provided_ = False
            return self

        # Check that X_source is a 2D array and has only finite values
        # (reset=False also checks it against the feature count seen in X).
        X_source = validate_data(
            self, X_source, ensure_2d=True, reset=False, dtype=np.float64
        )

        # Check consistency between X and X_source
        if X_source.shape != X.shape:
            raise ValueError(
                f"X and X_source must have the same shape, "
                f"got X={X.shape} and X_source={X_source.shape}."
            )

        # Low-rank factorisation of the map: T = V_ds_ @ B_ds_.
        # With the thin SVD, r = min(n_samples, n_features). When the number
        # of transfer samples is much smaller than the number of features,
        # storing the two thin factors is O(n_features * r) instead of
        # O(n_features^2) for the full matrix, and transform becomes two
        # small matmuls instead of one large one.
        U, s, Vt = np.linalg.svd(X, full_matrices=False)

        # Numerical cutoff for singular values considered to be zero
        eps = np.finfo(s.dtype).eps
        tol = max(X.shape) * np.amax(s) * eps

        # Inversion of s with cutoff: entries at or below tol stay zero so
        # that near-singular directions do not blow up the map.
        s_inv = np.zeros_like(s)
        mask = s > tol
        s_inv[mask] = 1.0 / s[mask]

        # Transformation using the safe s_inv
        self.V_ds_ = Vt.T  # (n_features, r)
        self.B_ds_ = s_inv[:, None] * (U.T @ X_source)  # (r, n_features)
        self.x_source_provided_ = True

        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Transform the data from the target space to the source space using
        the fitted low-rank map ``V_ds_ @ B_ds_``.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features)
            The input data to transform.

        Returns
        -------
        X_transf : np.ndarray of shape (n_samples, n_features)
            The transformed data. If no ``X_source`` was provided during
            fitting, the validated input is returned unchanged.
        """
        # Check that the estimator is fitted
        check_is_fitted(self, "V_ds_")

        # Validate the input data
        X = validate_data(
            self,
            X,
            ensure_2d=True,
            reset=False,
            dtype=np.float64,
        )

        # Identity fallback: no source data was provided at fit time
        if not self.x_source_provided_:
            return X

        # Low-rank transform: projecting onto V_ds_ first costs O(n * p * r)
        # instead of the O(n * p^2) of a full (p, p) map, where
        # r = V_ds_.shape[1] and p = n_features.
        return (X @ self.V_ds_) @ self.B_ds_