diff --git a/.gitignore b/.gitignore index 3ba72acd9..0096d1595 100644 --- a/.gitignore +++ b/.gitignore @@ -86,6 +86,7 @@ celerybeat-schedule # Environments .env .venv +.venv_wsl env/ venv/ ENV/ diff --git a/docs/whats_new/v_190.rst b/docs/whats_new/v_190.rst index 3ee3222fb..7f9ed486a 100644 --- a/docs/whats_new/v_190.rst +++ b/docs/whats_new/v_190.rst @@ -53,6 +53,8 @@ New transformers Enhancements ~~~~~~~~~~~~ +- Added `errors` parameter to `CategoricalImputer` to handle categorical variables with multiple frequent categories instead of automatically raising a `ValueError`. (`DirekKakkar `_) +- Added ``unseen='warn'`` option to `CountFrequencyEncoder`: unseen categories are encoded as ``NaN`` and a ``UserWarning`` is raised listing the unseen categories per variable. (`DirekKakkar `_) - Our variable handling functions now return empty lists when no variables of the desired type are found. (`Soledad Galli `_) BUG diff --git a/feature_engine/encoding/base_encoder.py b/feature_engine/encoding/base_encoder.py index b4ae3478f..276bc1e26 100644 --- a/feature_engine/encoding/base_encoder.py +++ b/feature_engine/encoding/base_encoder.py @@ -6,27 +6,21 @@ from sklearn.utils.validation import check_is_fitted from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin -from feature_engine._check_init_parameters.check_variables import ( - _check_variables_input_value, -) +from feature_engine._check_init_parameters.check_variables import \ + _check_variables_input_value from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _missing_values_docstring, - _variables_categorical_docstring, -) -from feature_engine._docstrings.init_parameters.encoders import _ignore_format_docstring + _missing_values_docstring, _variables_categorical_docstring) +from feature_engine._docstrings.init_parameters.encoders import \ + _ignore_format_docstring from feature_engine._docstrings.substitute import Substitution -from feature_engine.dataframe_checks import 
( - _check_optional_contains_na, - _check_X_matches_training_df, - check_X, -) +from feature_engine.dataframe_checks import (_check_optional_contains_na, + _check_X_matches_training_df, + check_X) from feature_engine.tags import _return_tags -from feature_engine.variable_handling import ( - check_all_variables, - check_categorical_variables, - find_all_variables, - find_categorical_variables, -) +from feature_engine.variable_handling import (check_all_variables, + check_categorical_variables, + find_all_variables, + find_categorical_variables) @Substitution( @@ -221,6 +215,18 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: def _encode(self, X: pd.DataFrame) -> pd.DataFrame: # replace categories by the learned parameters for feature in self.encoder_dict_.keys(): + # Detect unseen categories BEFORE mapping so we can name them + if self.unseen == "warn": + unseen_cats = set(X[feature].dropna().unique()) - set( + self.encoder_dict_[feature].keys() + ) + if unseen_cats: + warnings.warn( + f"Variable {feature!r} contains unseen categories: " + f"{unseen_cats}. These will be encoded as NaN.", + UserWarning, + ) + X[feature] = X[feature].map(self.encoder_dict_[feature]) # if original variables are cast as categorical, they will remain @@ -266,6 +272,8 @@ def _check_nan_values_after_transformation(self, X): "During the encoding, NaN values were introduced in the feature(s) " f"{nan_columns_str}." ) + # 'warn': per-variable warnings were already issued in _encode before + # the mapping, so nothing more to do here. def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame: """Convert the encoded variable back to the original values. 
diff --git a/feature_engine/encoding/count_frequency.py b/feature_engine/encoding/count_frequency.py index ae6507627..854e3ea26 100644 --- a/feature_engine/encoding/count_frequency.py +++ b/feature_engine/encoding/count_frequency.py @@ -6,34 +6,26 @@ import pandas as pd from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, - _n_features_in_docstring, - _variables_attribute_docstring, -) + _feature_names_in_docstring, _n_features_in_docstring, + _variables_attribute_docstring) from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _missing_values_docstring, - _variables_categorical_docstring, -) + _missing_values_docstring, _variables_categorical_docstring) from feature_engine._docstrings.init_parameters.encoders import ( - _ignore_format_docstring, - _unseen_docstring, -) -from feature_engine._docstrings.methods import ( - _fit_transform_docstring, - _inverse_transform_docstring, - _transform_encoders_docstring, -) + _ignore_format_docstring, _unseen_docstring) +from feature_engine._docstrings.methods import (_fit_transform_docstring, + _inverse_transform_docstring, + _transform_encoders_docstring) from feature_engine._docstrings.substitute import Substitution from feature_engine.dataframe_checks import check_X from feature_engine.encoding._helper_functions import check_parameter_unseen -from feature_engine.encoding.base_encoder import ( - CategoricalInitMixinNA, - CategoricalMethodsMixin, -) +from feature_engine.encoding.base_encoder import (CategoricalInitMixinNA, + CategoricalMethodsMixin) _unseen_docstring = ( _unseen_docstring + """ If `'encode'`, unseen categories will be encoded as 0 (zero).""" + + """ If `'warn'`, unseen categories will be encoded as NaN and a""" + + """ UserWarning is raised listing the unseen categories per variable.""" ) @@ -166,7 +158,7 @@ def __init__( f"Got {encoding_method} instead." 
) - check_parameter_unseen(unseen, ["ignore", "raise", "encode"]) + check_parameter_unseen(unseen, ["ignore", "raise", "encode", "warn"]) super().__init__(variables, missing_values, ignore_format) self.encoding_method = encoding_method self.unseen = unseen diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 8c4000a0c..2d1f48e97 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -1,33 +1,26 @@ # Authors: Soledad Galli # License: BSD 3 clause +import warnings from typing import List, Optional, Union import pandas as pd -from feature_engine._check_init_parameters.check_variables import ( - _check_variables_input_value, -) +from feature_engine._check_init_parameters.check_variables import \ + _check_variables_input_value from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, - _imputer_dict_docstring, - _n_features_in_docstring, - _variables_attribute_docstring, -) -from feature_engine._docstrings.methods import ( - _fit_transform_docstring, - _transform_imputers_docstring, -) + _feature_names_in_docstring, _imputer_dict_docstring, + _n_features_in_docstring, _variables_attribute_docstring) +from feature_engine._docstrings.methods import (_fit_transform_docstring, + _transform_imputers_docstring) from feature_engine._docstrings.substitute import Substitution from feature_engine.dataframe_checks import check_X from feature_engine.imputation.base_imputer import BaseImputer from feature_engine.tags import _return_tags -from feature_engine.variable_handling import ( - check_all_variables, - check_categorical_variables, - find_all_variables, - find_categorical_variables, -) +from feature_engine.variable_handling import (check_all_variables, + check_categorical_variables, + find_all_variables, + find_categorical_variables) @Substitution( @@ -88,6 +81,18 @@ class CategoricalImputer(BaseImputer): type object or categorical. 
If True, the imputer will select all variables or accept all variables entered by the user, including those cast as numeric. + errors : str, default='raise' + Indicates what to do when the selected imputation_method='frequent' + and a variable has more than 1 mode. + + If 'raise', raises a ValueError and stops the fit. + + If 'warn', raises a UserWarning and continues, imputing using the + first most frequent category found. + + If 'ignore', continues without warnings, imputing using the first + most frequent category found. + Attributes ---------- {imputer_dict_} @@ -135,6 +140,7 @@ def __init__( variables: Union[None, int, str, List[Union[str, int]]] = None, return_object: bool = False, ignore_format: bool = False, + errors: str = "raise", ) -> None: if imputation_method not in ["missing", "frequent"]: raise ValueError( @@ -144,11 +150,18 @@ def __init__( if not isinstance(ignore_format, bool): raise ValueError("ignore_format takes only booleans True and False") + if errors not in ("raise", "warn", "ignore"): + raise ValueError( + "errors takes only values 'raise', 'warn', or 'ignore'. " + f"Got {errors} instead." + ) + self.imputation_method = imputation_method self.fill_value = fill_value self.variables = _check_variables_input_value(variables) self.return_object = return_object self.ignore_format = ignore_format + self.errors = errors def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ @@ -189,9 +202,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # Some variables may contain more than 1 mode: if len(mode_vals) > 1: - raise ValueError( - f"The variable {var} contains multiple frequent categories." - ) + if self.errors == "raise": + raise ValueError( + f"The variable {var} contains multiple " + f"frequent categories. Set errors='warn' or " + f"errors='ignore' to allow imputation using " + f"the first most frequent category found." 
+ ) + elif self.errors == "warn": + warnings.warn( + f"Variable {var} has multiple frequent " + f"categories. The first category found, " + f"{mode_vals[0]}, will be used for imputation.", + UserWarning, + ) self.imputer_dict_ = {var: mode_vals[0]} @@ -208,10 +232,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): varnames_str = ", ".join(varnames) else: varnames_str = varnames[0] - raise ValueError( - f"The variable(s) {varnames_str} contain(s) multiple frequent " - f"categories." - ) + + if self.errors == "raise": + raise ValueError( + f"The variable(s) {varnames_str} contain(s) " + f"multiple frequent categories. Set " + f"errors='warn' or errors='ignore' to allow " + f"imputation using the first most frequent " + f"category found." + ) + elif self.errors == "warn": + warnings.warn( + f"Variable(s) {varnames_str} have multiple " + f"frequent categories. The first category " + f"found will be used for imputation.", + UserWarning, + ) self.imputer_dict_ = mode_vals.iloc[0].to_dict() diff --git a/tests/test_encoding/test_count_frequency_encoder.py b/tests/test_encoding/test_count_frequency_encoder.py index 55e13b1cc..c447a1e37 100644 --- a/tests/test_encoding/test_count_frequency_encoder.py +++ b/tests/test_encoding/test_count_frequency_encoder.py @@ -6,6 +6,7 @@ from sklearn.exceptions import NotFittedError from feature_engine.encoding import CountFrequencyEncoder +from feature_engine.encoding._helper_functions import check_parameter_unseen # init parameters @@ -237,7 +238,7 @@ def test_no_error_triggered_when_df_contains_unseen_categories_and_unseen_is_enc encoder.transform(df_enc_rare) -@pytest.mark.parametrize("errors", ["raise", "ignore", "encode"]) +@pytest.mark.parametrize("errors", ["raise", "ignore", "encode", "warn"]) def test_fit_raises_error_if_df_contains_na(errors, df_enc_na): # test case 4: when dataset contains na, fit method encoder = CountFrequencyEncoder(unseen=errors) @@ -251,7 +252,7 @@ def 
test_fit_raises_error_if_df_contains_na(errors, df_enc_na): assert str(record.value) == msg -@pytest.mark.parametrize("errors", ["raise", "ignore", "encode"]) +@pytest.mark.parametrize("errors", ["raise", "ignore", "encode", "warn"]) def test_transform_raises_error_if_df_contains_na(errors, df_enc, df_enc_na): # test case 4: when dataset contains na, transform method encoder = CountFrequencyEncoder(unseen=errors) @@ -476,3 +477,88 @@ def test_inverse_transform_raises_non_fitted_error(): # Test when fit is not called prior to transform. with pytest.raises(NotFittedError): enc.inverse_transform(df1) + + +# --------------------------------------------------------------------------- +# Tests for unseen='warn' +# --------------------------------------------------------------------------- + +@pytest.fixture +def train_test_dfs_warn(): + X_train = pd.DataFrame({"color": ["red", "red", "blue", "green", "blue"]}) + X_test = pd.DataFrame({"color": ["red", "blue", "yellow"]}) # 'yellow' unseen + return X_train, X_test + + +def test_unseen_warn_emits_userwarning(train_test_dfs_warn): + """unseen='warn': UserWarning emitted for unseen categories.""" + X_train, X_test = train_test_dfs_warn + encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn") + encoder.fit(X_train) + with pytest.warns(UserWarning, match="unseen categories"): + encoder.transform(X_test) + + +def test_unseen_warn_encodes_as_nan(train_test_dfs_warn): + """unseen='warn': unseen categories should become NaN.""" + X_train, X_test = train_test_dfs_warn + encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn") + encoder.fit(X_train) + with pytest.warns(UserWarning): + X_tr = encoder.transform(X_test) + # 'yellow' is unseen — should be NaN + assert pd.isna(X_tr.loc[X_tr.index[2], "color"]) + + +def test_unseen_warn_known_categories_encoded_correctly(train_test_dfs_warn): + """unseen='warn': known categories still encoded correctly.""" + X_train, X_test = train_test_dfs_warn + encoder = 
CountFrequencyEncoder(encoding_method="count", unseen="warn")
+    encoder.fit(X_train)
+    with pytest.warns(UserWarning):
+        X_tr = encoder.transform(X_test)
+    # 'red' appears 2 times in training
+    assert X_tr.loc[X_tr.index[0], "color"] == 2
+
+
+def test_unseen_warn_no_warning_when_no_unseen(train_test_dfs_warn, recwarn):
+    """unseen='warn': no warning if all categories were seen during fit."""
+    X_train, _ = train_test_dfs_warn
+    X_test_seen = pd.DataFrame({"color": ["red", "blue"]})
+    encoder = CountFrequencyEncoder(encoding_method="count", unseen="warn")
+    encoder.fit(X_train)
+    encoder.transform(X_test_seen)
+    # recwarn is pytest's built-in fixture, so no `import warnings` is needed
+    assert len(recwarn) == 0  # Fail if any warning was raised
+
+
+def test_unseen_invalid_value_raises():
+    """Invalid unseen value should raise ValueError at init."""
+    with pytest.raises(ValueError, match="takes only values"):
+        CountFrequencyEncoder(unseen="bad_value")
+
+
+# =============================================================================
+# NEW TESTS — added to fix codecov patch coverage
+# =============================================================================
+
+def test_check_parameter_unseen_raises_when_accepted_values_is_not_a_list():
+    """
+    Covers the first raise ValueError in check_parameter_unseen():
+
+    if not isinstance(accepted_values, list) or not all(
+        isinstance(item, str) for item in accepted_values
+    ):
+        raise ValueError("accepted_values should be a list of strings ...")
+
+    check_parameter_unseen() is an internal helper. CountFrequencyEncoder always
+    calls it with a hardcoded valid list, so the guard is never triggered through
+    normal usage — it must be tested by importing and calling the function directly.
+ """ + # accepted_values is not a list at all + with pytest.raises(ValueError, match="accepted_values should be a list of strings"): + check_parameter_unseen("raise", "raise") + + # accepted_values is a list but contains a non-string element + with pytest.raises(ValueError, match="accepted_values should be a list of strings"): + check_parameter_unseen("raise", ["raise", "ignore", 42]) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 182e8826b..1c0640a58 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -1,9 +1,23 @@ +import warnings + +import numpy as np import pandas as pd import pytest from feature_engine.imputation import CategoricalImputer +# --- Shared fixture: perfectly multimodal variable --- +@pytest.fixture +def multimodal_df(): + return pd.DataFrame( + { + "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], + "country": ["UK", "UK", "FR", "FR", "DE", "DE"], + } + ) + + def test_impute_with_string_missing_and_automatically_find_variables(df_na): # set up transformer imputer = CategoricalImputer(imputation_method="missing", variables=None) @@ -150,14 +164,22 @@ def test_error_when_imputation_method_not_frequent_or_missing(): def test_error_when_variable_contains_multiple_modes(df_na): - msg = "The variable Name contains multiple frequent categories." + msg = ( + "The variable Name contains multiple frequent categories. " + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) imputer = CategoricalImputer(imputation_method="frequent", variables="Name") with pytest.raises(ValueError) as record: imputer.fit(df_na) # check that error message matches assert str(record.value) == msg - msg = "The variable(s) Name contain(s) multiple frequent categories." + msg = ( + "The variable(s) Name contain(s) multiple frequent categories. 
" + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) imputer = CategoricalImputer(imputation_method="frequent") with pytest.raises(ValueError) as record: imputer.fit(df_na) @@ -166,7 +188,11 @@ def test_error_when_variable_contains_multiple_modes(df_na): df_ = df_na.copy() df_["Name_dup"] = df_["Name"] - msg = "The variable(s) Name, Name_dup contain(s) multiple frequent categories." + msg = ( + "The variable(s) Name, Name_dup contain(s) multiple frequent categories. " + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) imputer = CategoricalImputer(imputation_method="frequent") with pytest.raises(ValueError) as record: imputer.fit(df_) @@ -305,3 +331,88 @@ def test_error_when_ignore_format_is_not_boolean(ignore_format): # check that error message matches assert str(record.value) == msg + + +def test_errors_raise_on_multimodal_is_default(multimodal_df): + """Default behaviour: raise ValueError on multimodal variable.""" + imputer = CategoricalImputer(imputation_method="frequent") + with pytest.raises(ValueError, match="multiple frequent categories"): + imputer.fit(multimodal_df) + + +def test_errors_warn_emits_userwarning(multimodal_df): + """errors='warn': UserWarning must be emitted.""" + imputer = CategoricalImputer(imputation_method="frequent", errors="warn") + with pytest.warns(UserWarning, match="multiple frequent categories"): + imputer.fit(multimodal_df) + + +def test_errors_warn_uses_first_mode(multimodal_df): + """errors='warn': imputer_dict_ should contain the first mode.""" + imputer = CategoricalImputer(imputation_method="frequent", errors="warn") + with pytest.warns(UserWarning): + imputer.fit(multimodal_df) + expected = multimodal_df["city"].mode()[0] + assert imputer.imputer_dict_["city"] == expected + + +def test_errors_ignore_no_warning_raised(multimodal_df): + """errors='ignore': no warnings should be emitted.""" + 
imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") + with warnings.catch_warnings(): + warnings.simplefilter("error") # Promote all warnings to errors + imputer.fit(multimodal_df) # Should NOT raise + assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] + + +def test_errors_invalid_value_raises(): + """Passing an unsupported value for errors should raise ValueError at init.""" + with pytest.raises(ValueError, match="errors takes only values"): + CategoricalImputer(imputation_method="frequent", errors="bad_value") + + +def test_errors_param_ignored_when_imputation_method_is_missing(): + """errors param has no effect for imputation_method='missing'.""" + df = pd.DataFrame({"city": ["London", np.nan, "Paris"]}) + imputer = CategoricalImputer(imputation_method="missing", errors="warn") + # Should fit without warnings since there's no mode computation + with warnings.catch_warnings(): + warnings.simplefilter("error") + imputer.fit(df) + + +def test_errors_ignore_single_variable(): + """errors='ignore' on single multimodal variable — silent, uses first mode.""" + X = pd.DataFrame( + {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} + ) + imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") + imputer.fit(X) + assert imputer.imputer_dict_["city"] == X["city"].mode()[0] + + +def test_errors_ignore_multiple_variables(): + """errors='ignore' on multiple multimodal variables — silent, uses first mode.""" + X = pd.DataFrame( + { + "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], + "country": ["UK", "UK", "FR", "FR", "DE", "DE"], + } + ) + imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") + imputer.fit(X) + assert imputer.imputer_dict_["city"] == X["city"].mode()[0] + assert imputer.imputer_dict_["country"] == X["country"].mode()[0] + + +def test_errors_warn_single_variable(): + """errors='warn' on single multimodal variable — warns, uses first 
mode.""" + X = pd.DataFrame( + {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} + ) + imputer = CategoricalImputer( + imputation_method="frequent", variables=["city"], errors="warn" + ) + with pytest.warns(UserWarning, match="Variable city has multiple frequent"): + imputer.fit(X) + assert imputer.imputer_dict_["city"] == X["city"].mode()[0]