# Source code for chemotools.scale._pareto_scaler

"""
The :mod:`chemotools.scale._pareto_scaler` module implements a Pareto Scaler
transformer.
"""

# Authors: Pau Cabaneros
# License: MIT

import warnings
from numbers import Real

import numpy as np
from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin
from sklearn.utils._param_validation import Interval
from sklearn.utils.validation import check_is_fitted, validate_data


class ParetoScaler(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
    """
    This transformer scales data using a generalized power of the standard
    deviation, as described by [1]_. It acts as a bridge between Mean
    Centering (P=0) and Autoscaling (P=1).

    Parameters
    ----------
    p : float, default=0.5
        The exponent to use in the scaling. Must be a non-negative float
        between 0 and 1.

        - p=0.0: No scaling (Mean Centering only).
        - p=0.5: Standard Pareto Scaling.
        - p=1.0: Autoscaling (Unit Variance scaling).

    with_mean : bool, default=True
        If True, center the data before scaling. If False, no centering is
        performed.

    copy : bool, default=True
        If True, a copy of the input data will be made. If False, the input
        data will be modified in place.

    Attributes
    ----------
    mean_ : np.ndarray of shape (n_features,)
        The mean value for each feature, calculated during fitting.

    scale_ : np.ndarray of shape (n_features,)
        The scale factor for each feature, calculated as the standard
        deviation raised to the power of p.

    n_features_in_ : int
        The number of features in the input data.

    References
    ----------
    .. [1] Kurt Varmuza and Peter Filzmoser (2024), Adjusted Pareto Scaling
       for Multivariate Calibration Models, Journal of Chemometrics,
       Volume 38, Issue 11, https://doi.org/10.1002/cem.3588.

    Examples
    --------
    >>> from chemotools.datasets import load_fermentation_train
    >>> from chemotools.scale import ParetoScaler
    >>> # Load sample data
    >>> X, _ = load_fermentation_train()
    >>> scaler = ParetoScaler(p=0.3)
    >>> # Fit and transform the data
    >>> X_scaled = scaler.fit_transform(X)

    Notes
    -----
    In spectroscopic applications, standard Pareto scaling (:math:`P=0.5`)
    is often used to reduce the dominance of large peaks (e.g., solvent or
    high-abundance metabolites) without inflating baseline noise as severely
    as autoscaling. According to Varmuza & Filzmoser, :math:`P` should be
    treated as a tunable hyperparameter.

    For datasets where relevant information is buried in low-intensity
    signals but the noise floor is high, an "Adjusted" :math:`P` (e.g., 0.3
    or 0.7) may provide a superior balance of signal-to-noise ratio and
    model interpretability compared to fixed Pareto scaling.

    See Also
    --------
    sklearn.preprocessing.StandardScaler : Standardize features by removing
        the mean and scaling to unit variance.
    """

    # Declarative constraints consumed by BaseEstimator._validate_params()
    # (called from fit): p must lie in the closed interval [0, 1].
    _parameter_constraints: dict = {
        "p": [Interval(Real, 0, 1, closed="both")],
        "with_mean": ["boolean"],
        "copy": ["boolean"],
    }

    def __init__(
        self,
        p: float = 0.5,
        with_mean: bool = True,
        copy: bool = True,
    ):
        # sklearn convention: __init__ only stores hyperparameters verbatim;
        # all validation and computation is deferred to fit().
        self.p = p
        self.with_mean = with_mean
        self.copy = copy
[docs] def fit(self, X: np.ndarray, y=None) -> "ParetoScaler": """ Fit the transformer to the input data. Parameters ---------- X : np.ndarray of shape (n_samples, n_features) The input data to fit the transformer to. y : None Ignored to align with API. Returns ------- self : ParetoScaler The fitted transformer. """ # Validate input parameters self._validate_params() # Validate input data X = validate_data( self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64 ) # 1. Calculate Mean if self.with_mean: self.mean_ = np.mean(X, axis=0) else: self.mean_ = np.zeros(X.shape[1]) # 2. Calculate Standard Deviation (The base for Pareto) # Using ddof=0 for population standard deviation (ddof set to 0 to align with # StandardScaler) std = np.std(X, axis=0, ddof=0) # 3. Calculate Scale: std^p # Note: If p=0.5, it's sqrt(std), which is Pareto. self.scale_ = np.power(std, self.p) # 4. Handle Zero Variance / Constant Features # Catch cases where scale_ is 0 (which happens if std is 0 and p > 0) zero_scale_mask = np.isclose(self.scale_, 0) if np.any(zero_scale_mask): num_zeros = np.sum(zero_scale_mask) warnings.warn( f"The scale for {num_zeros} feature(s) is zero (constant columns). " "To avoid division by zero, these features will not be scaled.", UserWarning, ) # Set scale to 1.0 so that (X - mean) / 1.0 = (X - mean) self.scale_[zero_scale_mask] = 1.0 return self
[docs] def transform(self, X: np.ndarray, y=None) -> np.ndarray: """ Transform the input data. Parameters ---------- X : np.ndarray of shape (n_samples, n_features) The input data to transform. y : None Ignored to align with API. Returns ------- X_transformed : np.ndarray of shape (n_samples, n_features) The transformed data. """ # Check that the estimator is fitted check_is_fitted(self, "scale_") # Check that X is a 2D array and has only finite values X_ = validate_data( self, X, y="no_validation", ensure_2d=True, copy=self.copy, reset=False, dtype=np.float64, ) # Return the scaled data return (X_ - self.mean_) / self.scale_
[docs] def inverse_transform(self, X: np.ndarray) -> np.ndarray: """ Inverse transform the data back to the original space. Parameters ---------- X : np.ndarray of shape (n_samples, n_features) The data to inverse transform. Returns ------- X_original : np.ndarray of shape (n_samples, n_features) The data transformed back to the original space. """ # Check that the estimator is fitted check_is_fitted(self, "scale_") # Check that X is a 2D array and has only finite values X_ = validate_data( self, X, y="no_validation", ensure_2d=True, copy=self.copy, reset=False, dtype=np.float64, ) return X_ * self.scale_ + self.mean_