From 18a683b2289de26e2688c3edf76e3eb7ee0720c3 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Tue, 10 Mar 2026 10:59:32 -0500 Subject: [PATCH 1/5] feat: Add GroupStandardScaler for scaling variables relative to a given group --- docs/api_doc/scaling/GroupStandardScaler.rst | 5 + docs/api_doc/scaling/index.rst | 1 + .../scaling/GroupStandardScaler.rst | 79 ++++++ docs/user_guide/scaling/index.rst | 1 + feature_engine/scaling/__init__.py | 2 + feature_engine/scaling/group_standard.py | 259 ++++++++++++++++++ tests/test_scaling/test_group_standard.py | 100 +++++++ 7 files changed, 447 insertions(+) create mode 100644 docs/api_doc/scaling/GroupStandardScaler.rst create mode 100644 docs/user_guide/scaling/GroupStandardScaler.rst create mode 100644 feature_engine/scaling/group_standard.py create mode 100644 tests/test_scaling/test_group_standard.py diff --git a/docs/api_doc/scaling/GroupStandardScaler.rst b/docs/api_doc/scaling/GroupStandardScaler.rst new file mode 100644 index 000000000..13df0fb46 --- /dev/null +++ b/docs/api_doc/scaling/GroupStandardScaler.rst @@ -0,0 +1,5 @@ +GroupStandardScaler +=================== + +.. autoclass:: feature_engine.scaling.GroupStandardScaler + :members: diff --git a/docs/api_doc/scaling/index.rst b/docs/api_doc/scaling/index.rst index d22216fde..989c62f16 100644 --- a/docs/api_doc/scaling/index.rst +++ b/docs/api_doc/scaling/index.rst @@ -10,3 +10,4 @@ given columns :maxdepth: 1 MeanNormalizationScaler + GroupStandardScaler diff --git a/docs/user_guide/scaling/GroupStandardScaler.rst b/docs/user_guide/scaling/GroupStandardScaler.rst new file mode 100644 index 000000000..5ac4c906a --- /dev/null +++ b/docs/user_guide/scaling/GroupStandardScaler.rst @@ -0,0 +1,79 @@ +.. _group_standard_scaler: + +.. currentmodule:: feature_engine.scaling + +GroupStandardScaler +=================== + +:class:`GroupStandardScaler()` scales numerical variables relative to a group. 
It standardises variables by removing the mean and scaling to unit variance per group. This means that for each group within the reference column, the scaler learns the mean and standard deviation of each variable to be scaled. During transformation, it applies the standard z-score formula. + +The :class:`GroupStandardScaler()` requires numerical variables to be scaled, and at least one reference variable which acts as the grouping key. + +Python example +-------------- + +We'll show how to use :class:`GroupStandardScaler()` through a toy dataset. Let's create a toy dataset: + +.. code:: python + + import pandas as pd + from feature_engine.scaling import GroupStandardScaler + + df = pd.DataFrame({ + "House_Price": [100000, 150000, 120000, 500000, 550000, 480000], + "Neighborhood": ["A", "A", "A", "B", "B", "B"] + }) + + print(df) + +The dataset looks like this: + +.. code:: python + + House_Price Neighborhood + 0 100000 A + 1 150000 A + 2 120000 A + 3 500000 B + 4 550000 B + 5 480000 B + +We want to scale the prices relative to the neighborhood so we know if a house is relatively expensive for its neighborhood. + +.. code:: python + + # set up the scaler + scaler = GroupStandardScaler( + variables=["House_Price"], + reference=["Neighborhood"] + ) + + # fit the scaler + scaler.fit(df) + +The scaler learns the mean and standard deviation of the House_Price per neighborhood: + +.. code:: python + + print(scaler.barycenter_) + # Means: {'House_Price': {'A': 123333.33333333333, 'B': 510000.0}} + + print(scaler.scale_) + # Std Devs: {'House_Price': {'A': 25166.11478423583, 'B': 36055.51275463989}} + +Now we can apply the transformation: + +.. code:: python + + df_scaled = scaler.transform(df) + print(df_scaled) + +.. 
# Authors: Ankit Hemant Lade (contributor)
# License: BSD 3 clause

from typing import List, Optional, Union

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin
from feature_engine._check_init_parameters.check_variables import (
    _check_variables_input_value,
)
from feature_engine.dataframe_checks import (
    _check_optional_contains_na,
    _check_X_matches_training_df,
    check_X,
)
from feature_engine.tags import _return_tags
from feature_engine.variable_handling import (
    check_numerical_variables,
    find_numerical_variables,
)


class GroupStandardScaler(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin):
    """
    GroupStandardScaler() scales numerical variables relative to a group, by
    standardising them to have a mean of 0 and a standard deviation of 1 within
    each group.

    The transformer takes a list of numerical `variables` to standardise and one
    or more `reference` variables to group by. During `fit`, it learns the mean
    and the sample standard deviation (`ddof=1`) of each variable per group.
    During `transform`, it applies the z-score formula using the parameters of
    the group each row belongs to.

    Groups that were not seen during `fit` are scaled using the global mean and
    standard deviation learned during `fit`.

    Groups whose standard deviation cannot be estimated (a single observation)
    or is zero (constant values) are stored with a scale of 0 and are divided by
    1 during `transform`, that is, they are only centred.

    More details in the :ref:`User Guide <group_standard_scaler>`.

    Parameters
    ----------
    variables: list, default=None
        The list of numerical variables to be scaled. If None, the transformer
        will automatically find and select all numerical variables in the
        dataframe, except those specified in the `reference` parameter.

    reference: str, int or list
        The variable or list of variables to use as the grouping key. These
        variables can be categorical or numerical. This parameter is mandatory.

    Attributes
    ----------
    barycenter_:
        Dictionary with the mean value per group for each variable.
        e.g. `{'var1': {grp1: 1.5, grp2: 3.0}}`. With more than one reference
        variable, the group keys are tuples.

    scale_:
        Dictionary with the standard deviation per group for each variable.
        e.g. `{'var1': {grp1: 0.5, grp2: 1.0}}`

    global_mean_:
        Dictionary with the global mean value for each variable (for unseen
        groups).

    global_std_:
        Dictionary with the global standard deviation for each variable (for
        unseen groups). Undefined or zero deviations are stored as 1.

    variables_:
        The group of variables that will be transformed.

    reference_:
        The list of variables used to perform the grouping.

    feature_names_in_:
        List with the names of features seen during `fit`.

    n_features_in_:
        The number of features in the train set used in fit.

    Methods
    -------
    fit:
        Find the mean and standard deviation per group for each variable.

    fit_transform:
        Fit to data, then transform it.

    transform:
        Standardise the variables relative to their group.

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.scaling import GroupStandardScaler
    >>> X = pd.DataFrame(dict(
    ...     x1 = [1, 2, 3, 10, 20, 30],
    ...     grp = ['A', 'A', 'A', 'B', 'B', 'B']
    ... ))
    >>> gss = GroupStandardScaler(variables=['x1'], reference=['grp'])
    >>> gss.fit(X)
    >>> gss.transform(X)
        x1 grp
    0 -1.0   A
    1  0.0   A
    2  1.0   A
    3 -1.0   B
    4  0.0   B
    5  1.0   B
    """

    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        reference: Optional[Union[int, str, List[Union[str, int]]]] = None,
    ) -> None:

        # `reference` only defaults to None so that it can follow `variables`
        # in the signature; it is mandatory.
        if reference is None:
            raise ValueError("Parameter `reference` must be provided.")

        self.variables = _check_variables_input_value(variables)
        self.reference = _check_variables_input_value(reference)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the mean and standard deviation of each numerical variable per group.

        Parameters
        ----------
        X: Pandas DataFrame of shape = [n_samples, n_features]
            The training input samples.

        y: pandas Series, default=None
            It is not needed in this transformer. You can pass y or None.

        Returns
        -------
        self: GroupStandardScaler
            The fitted transformer.

        Raises
        ------
        KeyError
            If a reference variable is not a column of X.
        ValueError
            If no variable is left to scale, if a variable is used both as
            variable and as reference, or if the selected columns contain
            missing values.
        """
        # check input dataframe
        X = check_X(X)

        # The signature accepts a single column label; always work with a list,
        # otherwise the membership tests and list concatenations below would
        # operate on the characters of a string.
        reference = self.reference
        if isinstance(reference, (str, int)):
            self.reference_ = [reference]
        else:
            self.reference_ = list(reference)

        missing = [col for col in self.reference_ if col not in X.columns]
        if missing:
            raise KeyError(
                f"The reference variables {missing} are not in the dataframe."
            )

        # Find variables to scale
        if self.variables is None:
            # Reference variables may be numerical; never scale a grouping key.
            self.variables_ = [
                var
                for var in find_numerical_variables(X)
                if var not in self.reference_
            ]
            if not self.variables_:
                raise ValueError(
                    "No numerical variables left to scale after excluding the "
                    "reference variables."
                )
        else:
            self.variables_ = check_numerical_variables(X, self.variables)

        # check that variables and reference are not overlapping
        overlapping = set(self.variables_).intersection(set(self.reference_))
        if overlapping:
            raise ValueError(
                f"Variables {overlapping} are specified in both `variables` "
                f"and `reference`. "
                f"A variable cannot be both scaled and used as a grouping key."
            )

        # Check for missing values in variables and references
        _check_optional_contains_na(X, self.variables_ + self.reference_)

        # observed=True: for categorical references, only learn parameters for
        # categories present in the data, so that unused categories fall back
        # to the global parameters like any other unseen group.
        grouped = X.groupby(self.reference_, observed=True)[self.variables_]

        self.barycenter_ = grouped.mean().to_dict()
        # Groups with a single observation have an undefined (NaN) sample
        # standard deviation; store 0, which transform() treats as "centre only".
        self.scale_ = grouped.std(ddof=1).fillna(0.0).to_dict()

        # Calculate global parameters for unseen groups
        self.global_mean_ = X[self.variables_].mean().to_dict()
        global_std = X[self.variables_].std(ddof=1)
        # NaN (single row) or 0 (constant variable) cannot be used as divisor.
        self.global_std_ = global_std.where(global_std > 0, 1.0).to_dict()

        # Save input features
        self.feature_names_in_ = X.columns.tolist()
        self.n_features_in_ = X.shape[1]

        return self

    def _get_group_keys(self, X: pd.DataFrame) -> pd.Series:
        """Return one hashable group key per row, aligned with the index of X."""
        if len(self.reference_) == 1:
            keys = X[self.reference_[0]]
            # Mapping a categorical Series can return a categorical result,
            # which cannot be filled with the global fallback values.
            if isinstance(keys.dtype, pd.CategoricalDtype):
                keys = keys.astype(object)
            return keys

        # Several references: the learned dictionaries are keyed by tuples.
        return pd.Series(
            list(zip(*(X[col] for col in self.reference_))),
            index=X.index,
        )

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Scale the variables relative to their group.

        Parameters
        ----------
        X: Pandas DataFrame of shape = [n_samples, n_features]
            The data to be transformed.

        Returns
        -------
        X_new: pandas dataframe
            The dataframe with the standardized variables.
        """
        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = check_X(X)

        # Check input data contains same number of columns as df used to fit
        _check_X_matches_training_df(X, self.n_features_in_)

        # check if dataset contains na
        _check_optional_contains_na(X, self.variables_ + self.reference_)

        # reorder df to match train set; copy so the caller's data is untouched
        X_transformed = X[self.feature_names_in_].copy()

        group_keys = self._get_group_keys(X_transformed)

        for var in self.variables_:
            # Look up the learned parameters of each row's group; groups not
            # seen during `fit` come back as NaN and take the global values.
            means = group_keys.map(self.barycenter_[var]).fillna(
                self.global_mean_[var]
            )
            stds = group_keys.map(self.scale_[var]).fillna(self.global_std_[var])

            # Replace 0 standard deviation with 1 to avoid division by zero
            stds = stds.replace(0, 1)

            # Standardise
            X_transformed[var] = (X_transformed[var] - means) / stds

        return X_transformed

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Get output feature names for transformation.

        The transformer neither adds nor removes columns, so the output names
        are the names seen during `fit`. `input_features` is ignored.
        """
        check_is_fitted(self)
        return list(self.feature_names_in_)

    def _more_tags(self):
        tags_dict = _return_tags()
        tags_dict["variables"] = "numerical"
        # This transformer has mandatory parameters (reference)
        tags_dict["_xfail_checks"]["check_parameters_default_constructible"] = (
            "transformer has mandatory parameters"
        )
        return tags_dict
import pandas as pd
import pytest
from sklearn.exceptions import NotFittedError

from feature_engine.scaling import GroupStandardScaler


def test_group_standard_scaler_single_reference():
    """Each variable is standardised with the mean/std of its own group."""
    df = pd.DataFrame(
        {
            "var1": [1.0, 2.0, 3.0, 10.0, 20.0, 30.0],
            "var2": [4.0, 5.0, 6.0, 40.0, 50.0, 60.0],
            "grp": ["A", "A", "A", "B", "B", "B"],
        }
    )

    # Group A: var1 mean=2, std=1; var2 mean=5, std=1
    # Group B: var1 mean=20, std=10; var2 mean=50, std=10
    expected_df = pd.DataFrame(
        {
            "var1": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0],
            "var2": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0],
            "grp": ["A", "A", "A", "B", "B", "B"],
        }
    )

    transformer = GroupStandardScaler(variables=["var1", "var2"], reference=["grp"])
    X = transformer.fit_transform(df)

    pd.testing.assert_frame_equal(X, expected_df)

    # Learned attributes
    assert transformer.variables_ == ["var1", "var2"]
    assert transformer.reference_ == ["grp"]
    assert transformer.barycenter_ == {
        "var1": {"A": 2.0, "B": 20.0},
        "var2": {"A": 5.0, "B": 50.0},
    }
    assert transformer.scale_ == {
        "var1": {"A": 1.0, "B": 10.0},
        "var2": {"A": 1.0, "B": 10.0},
    }


def test_unseen_groups():
    """Groups not seen in fit are scaled with the global mean and std."""
    df_train = pd.DataFrame(
        {
            "var1": [2.0, 4.0, 10.0, 20.0],
            "grp": ["A", "A", "B", "B"],
        }
    )

    # Sample statistics (ddof=1):
    # Group A var1: mean=3, std=1.414
    # Group B var1: mean=15, std=7.071
    # Global var1: mean=9, std=8.083
    global_std = df_train["var1"].std()

    transformer = GroupStandardScaler(variables=["var1"], reference=["grp"])
    transformer.fit(df_train)

    # Use values away from the means where possible: a zero numerator would
    # hide a wrong standard deviation.
    df_test = pd.DataFrame(
        {
            "var1": [4.0, 15.0, 9.0 + global_std],
            "grp": ["A", "B", "C"],  # C is unseen
        }
    )

    X = transformer.transform(df_test)

    # A (seen): (4 - 3) / 1.414 = 0.7071
    # B (seen): (15 - 15) / 7.071 = 0
    # C (unseen, falls back to global): global_std / global_std = 1
    expected_df = pd.DataFrame(
        {
            "var1": [0.5 ** 0.5, 0.0, 1.0],
            "grp": ["A", "B", "C"],
        }
    )

    pd.testing.assert_frame_equal(X, expected_df)


def test_overlapping_variable_and_reference():
    """A column cannot be scaled and used as grouping key at the same time."""
    df = pd.DataFrame({"var1": [1.0, 2.0], "grp": ["A", "B"]})
    transformer = GroupStandardScaler(variables=["var1"], reference=["var1"])
    with pytest.raises(ValueError, match="specified in both"):
        transformer.fit(df)


def test_non_fitted_error():
    """transform() before fit() raises NotFittedError."""
    df = pd.DataFrame({"var1": [1.0, 2.0], "grp": ["A", "B"]})
    transformer = GroupStandardScaler(reference=["grp"])
    with pytest.raises(NotFittedError):
        transformer.transform(df)


def test_missing_reference_param():
    """`reference` is mandatory even though it has a default in the signature."""
    with pytest.raises(ValueError, match="Parameter `reference` must be provided."):
        GroupStandardScaler(variables=["var1"])


def test_dataset_contains_na():
    """Missing values in the variables to scale are rejected at fit time."""
    df_na = pd.DataFrame(
        {
            "var1": [1.0, float("nan"), 3.0],
            "grp": ["A", "A", "B"],
        }
    )
    transformer = GroupStandardScaler(reference=["grp"])
    with pytest.raises(ValueError):
        transformer.fit(df_na)
The transformer takes a list of numerical `variables` to standardise and a list @@ -34,7 +34,7 @@ class GroupStandardScaler(TransformerMixin, BaseEstimator, GetFeatureNamesOutMix standard deviation of each variable per group. During transform, it scales the variables applying the standard z-score formula per group. - Unseen groups during `transform` will be scaled using the global mean and + Unseen groups during `transform` will be scaled using the global mean and standard deviation learned during `fit`. More details in the :ref:`User Guide `. @@ -55,13 +55,12 @@ class GroupStandardScaler(TransformerMixin, BaseEstimator, GetFeatureNamesOutMix barycenter_: Dictionary with the mean value per group for each variable. e.g. `{'var1': {grp1: 1.5, grp2: 3.0}}` - scale_: Dictionary with the standard deviation per group for each variable. e.g. `{'var1': {grp1: 0.5, grp2: 1.0}}` global_mean_: - Dictionary with the global mean value for each variable (used for unseen groups). + Dictionary with the global mean value for each variable (for unseen groups). global_std_: Dictionary with the global standard deviation for each variable. 
@@ -112,7 +111,7 @@ class GroupStandardScaler(TransformerMixin, BaseEstimator, GetFeatureNamesOutMix def __init__( self, variables: Union[None, int, str, List[Union[str, int]]] = None, - reference: Union[int, str, List[Union[str, int]]] = None, + reference: Optional[Union[int, str, List[Union[str, int]]]] = None, ) -> None: if reference is None: @@ -161,7 +160,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # Calculate group means and standard deviations grouped = X.groupby(self.reference_)[self.variables_] - + self.barycenter_ = grouped.mean().to_dict() self.scale_ = grouped.std(ddof=1).to_dict() @@ -171,7 +170,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): if pd.isna(val): self.scale_[var][grp] = 0.0 elif val == 0: - self.scale_[var][grp] = 0.0 # Just making sure for consistency + self.scale_[var][grp] = 0.0 # Just making sure for consistency # Calculate global parameters for unseen groups self.global_mean_ = X[self.variables_].mean().to_dict() diff --git a/tests/test_scaling/test_group_standard.py b/tests/test_scaling/test_group_standard.py index dbdf1d38f..abd107f6c 100644 --- a/tests/test_scaling/test_group_standard.py +++ b/tests/test_scaling/test_group_standard.py @@ -4,6 +4,7 @@ from feature_engine.scaling import GroupStandardScaler + def test_group_standard_scaler_single_reference(): # Input data df = pd.DataFrame( @@ -13,7 +14,7 @@ def test_group_standard_scaler_single_reference(): "grp": ["A", "A", "A", "B", "B", "B"], } ) - + # Expected Output # Group A: var1 mean=2, std=1; var2 mean=5, std=1 # Group B: var1 mean=20, std=10; var2 mean=50, std=10 @@ -42,16 +43,17 @@ def test_group_standard_scaler_single_reference(): "var2": {"A": 1.0, "B": 10.0}, } + def test_unseen_groups(): df_train = pd.DataFrame({ "var1": [2.0, 4.0, 10.0, 20.0], "grp": ["A", "A", "B", "B"] }) - + # Group A var1: mean=3, std=1.414 # Group B var1: mean=15, std=7.07 # Global var1: mean=9, std=8.165 - + transformer = 
GroupStandardScaler(variables=["var1"], reference=["grp"]) transformer.fit(df_train) @@ -59,14 +61,14 @@ def test_unseen_groups(): "var1": [3.0, 15.0, 9.0], "grp": ["A", "B", "C"] # C is unseen }) - + X = transformer.transform(df_test) - + # Expected calculation # A (seen directly) : (3 - 3) / 1.414 = 0 # B (seen directly) : (15 - 15) / 7.07 = 0 # C (unseen, falls back to global): (9 - 9) / 8.165 = 0 - + expected_df = pd.DataFrame({ "var1": [0.0, 0.0, 0.0], "grp": ["A", "B", "C"] @@ -74,22 +76,26 @@ def test_unseen_groups(): pd.testing.assert_frame_equal(X, expected_df) + def test_overlapping_variable_and_reference(): df = pd.DataFrame({"var1": [1.0, 2.0], "grp": ["A", "B"]}) transformer = GroupStandardScaler(variables=["var1"], reference=["var1"]) with pytest.raises(ValueError): transformer.fit(df) + def test_non_fitted_error(): df = pd.DataFrame({"var1": [1.0, 2.0], "grp": ["A", "B"]}) transformer = GroupStandardScaler(reference=["grp"]) with pytest.raises(NotFittedError): transformer.transform(df) + def test_missing_reference_param(): with pytest.raises(ValueError, match="Parameter `reference` must be provided."): GroupStandardScaler(variables=["var1"]) + def test_dataset_contains_na(): df_na = pd.DataFrame({ "var1": [1.0, float('nan'), 3.0], From 9abffa46fde4aa741f1a47811aa024545fb7a921 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Tue, 10 Mar 2026 11:11:29 -0500 Subject: [PATCH 3/5] fix: Resolve flake8 E501 line length issue --- feature_engine/scaling/group_standard.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/feature_engine/scaling/group_standard.py b/feature_engine/scaling/group_standard.py index e67fd7bc2..86fd962e4 100644 --- a/feature_engine/scaling/group_standard.py +++ b/feature_engine/scaling/group_standard.py @@ -151,7 +151,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): overlapping = set(self.variables_).intersection(set(self.reference_)) if overlapping: raise ValueError( - f"Variables {overlapping} 
def test_variables_none_auto_detect():
    """Test fit with variables=None auto-detects numerical and excludes reference."""
    df = pd.DataFrame(
        {
            "var1": [1.0, 2.0, 3.0, 10.0, 20.0, 30.0],
            "var2": [4.0, 5.0, 6.0, 40.0, 50.0, 60.0],
            "grp": ["A", "A", "A", "B", "B", "B"],
        }
    )
    transformer = GroupStandardScaler(reference=["grp"])
    transformer.fit(df)
    assert transformer.variables_ == ["var1", "var2"]

    X = transformer.transform(df)
    assert list(X.columns) == transformer.get_feature_names_out()
    # Group A: mean=2/5, std=1; group B: mean=20/50, std=10
    expected_df = pd.DataFrame(
        {
            "var1": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0],
            "var2": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0],
            "grp": ["A", "A", "A", "B", "B", "B"],
        }
    )
    pd.testing.assert_frame_equal(X, expected_df)


def test_single_element_group_zero_std():
    """Test groups with one element or zero std (scale 0, replaced in transform)."""
    df = pd.DataFrame(
        {
            "var1": [1.0, 2.0, 2.0, 2.0],
            "grp": ["A", "B", "B", "B"],
        }
    )
    # Group A: single element -> std NaN -> set to 0 in fit
    # Group B: constant 2.0 -> std 0
    transformer = GroupStandardScaler(variables=["var1"], reference=["grp"])
    transformer.fit(df)
    assert transformer.scale_["var1"]["A"] == 0.0
    assert transformer.scale_["var1"]["B"] == 0.0

    X = transformer.transform(df)
    # In transform, 0 std is replaced with 1, so (x - mean) / 1 == 0 for every
    # row here, and the result contains no NaN or inf.
    expected_df = pd.DataFrame(
        {
            "var1": [0.0, 0.0, 0.0, 0.0],
            "grp": ["A", "B", "B", "B"],
        }
    )
    pd.testing.assert_frame_equal(X, expected_df)


def test_global_std_zero_or_nan():
    """Test global std NaN/0 fallback to 1.0 for unseen groups."""
    df = pd.DataFrame({"var1": [5.0, 5.0, 5.0], "grp": ["A", "A", "B"]})
    transformer = GroupStandardScaler(variables=["var1"], reference=["grp"])
    transformer.fit(df)
    assert transformer.global_std_["var1"] == 1.0

    df_test = pd.DataFrame({"var1": [5.0], "grp": ["C"]})
    X = transformer.transform(df_test)
    pd.testing.assert_frame_equal(
        X, pd.DataFrame({"var1": [0.0], "grp": ["C"]})
    )


def test_multiple_reference_columns():
    """Test transform with multiple reference variables (tuple group keys)."""
    df = pd.DataFrame(
        {
            "var1": [1.0, 2.0, 3.0, 10.0, 20.0, 30.0],
            "r1": ["X", "X", "X", "Y", "Y", "Y"],
            "r2": ["a", "a", "b", "a", "b", "b"],
        }
    )
    transformer = GroupStandardScaler(
        variables=["var1"], reference=["r1", "r2"]
    )
    transformer.fit(df)

    # Groups are keyed by (r1, r2) tuples.
    assert transformer.barycenter_["var1"][("X", "a")] == 1.5
    assert transformer.barycenter_["var1"][("Y", "b")] == 25.0
    # Single-row groups have no sample std -> stored as 0.
    assert transformer.scale_["var1"][("X", "b")] == 0.0
    assert transformer.scale_["var1"][("Y", "a")] == 0.0

    X = transformer.transform(df)
    assert list(X.columns) == transformer.get_feature_names_out()

    # (X, a): [1, 2]   -> mean 1.5, std 0.7071 -> -0.7071, 0.7071
    # (X, b): [3]      -> centred only         -> 0
    # (Y, a): [10]     -> centred only         -> 0
    # (Y, b): [20, 30] -> mean 25, std 7.071   -> -0.7071, 0.7071
    z = 0.5 ** 0.5
    expected_df = pd.DataFrame(
        {
            "var1": [-z, z, 0.0, 0.0, -z, z],
            "r1": ["X", "X", "X", "Y", "Y", "Y"],
            "r2": ["a", "a", "b", "a", "b", "b"],
        }
    )
    pd.testing.assert_frame_equal(X, expected_df)


def test_get_feature_names_out():
    """Test get_feature_names_out returns feature names in order."""
    df = pd.DataFrame(
        {"var1": [1.0, 2.0], "var2": [3.0, 4.0], "grp": ["A", "B"]}
    )
    transformer = GroupStandardScaler(
        variables=["var1", "var2"], reference=["grp"]
    )
    transformer.fit(df)
    names = transformer.get_feature_names_out()
    assert names == ["var1", "var2", "grp"]


def test_more_tags():
    """Test transformer tags for sklearn compatibility."""
    gss = GroupStandardScaler(variables=["x"], reference=["g"])
    tags = gss._more_tags()
    assert tags["variables"] == "numerical"
    assert "check_parameters_default_constructible" in tags["_xfail_checks"]