From fb230feeb6acc8b5da972133750eeb5278d0f7c2 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 8 Mar 2026 20:00:45 +0530 Subject: [PATCH 01/16] feat(CategoricalImputer): add errors param to handle multimodal variables (#904) --- docs/whats_new/v_190.rst | 1 + feature_engine/imputation/categorical.py | 54 +++++++++++-- .../test_categorical_imputer.py | 77 ++++++++++++++++++- 3 files changed, 122 insertions(+), 10 deletions(-) diff --git a/docs/whats_new/v_190.rst b/docs/whats_new/v_190.rst index 3ee3222fb..f1b6e22da 100644 --- a/docs/whats_new/v_190.rst +++ b/docs/whats_new/v_190.rst @@ -53,6 +53,7 @@ New transformers Enhancements ~~~~~~~~~~~~ +- Added `errors` parameter to `CategoricalImputer` to handle categorical variables with multiple frequent categories instead of automatically raising a `ValueError`. (`DirekKakkar `_) - Our variable handling functions now return empty lists when no variables of the desired type are found. (`Soledad Galli `_) BUG diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 8c4000a0c..40c0a1276 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -2,6 +2,7 @@ # License: BSD 3 clause from typing import List, Optional, Union +import warnings import pandas as pd @@ -88,6 +89,18 @@ class CategoricalImputer(BaseImputer): type object or categorical. If True, the imputer will select all variables or accept all variables entered by the user, including those cast as numeric. + errors : str, default='raise' + Indicates what to do when the selected imputation_method='frequent' + and a variable has more than 1 mode. + + If 'raise', raises a ValueError and stops the fit. + + If 'warn', raises a UserWarning and continues, imputing using the + first most frequent category found. + + If 'ignore', continues without warnings, imputing using the first + most frequent category found. + Attributes ---------- {imputer_dict_} @@ -135,6 +148,7 @@ def __init__( variables: Union[None, int, str, List[Union[str, int]]] = None, return_object: bool = False, ignore_format: bool = False, + errors: str = "raise", ) -> None: if imputation_method not in ["missing", "frequent"]: raise ValueError( @@ -144,11 +158,18 @@ def __init__( if not isinstance(ignore_format, bool): raise ValueError("ignore_format takes only booleans True and False") + if errors not in ("raise", "warn", "ignore"): + raise ValueError( + "errors takes only values 'raise', 'warn', or 'ignore'. " + f"Got {errors} instead." + ) + self.imputation_method = imputation_method self.fill_value = fill_value self.variables = _check_variables_input_value(variables) self.return_object = return_object self.ignore_format = ignore_format + self.errors = errors def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ @@ -189,9 +210,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # Some variables may contain more than 1 mode: if len(mode_vals) > 1: - raise ValueError( - f"The variable {var} contains multiple frequent categories." - ) + if self.errors == "raise": + raise ValueError( + f"The variable {var} contains multiple frequent categories. " + f"Set errors='warn' or errors='ignore' to allow imputation " + f"using the first most frequent category found." + ) + elif self.errors == "warn": + warnings.warn( + f"Variable {var} has multiple frequent categories. " + f"The first category found, {mode_vals[0]}, will be used " + f"for imputation.", + UserWarning, + ) self.imputer_dict_ = {var: mode_vals[0]} @@ -208,10 +239,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): varnames_str = ", ".join(varnames) else: varnames_str = varnames[0] - raise ValueError( - f"The variable(s) {varnames_str} contain(s) multiple frequent " - f"categories." - ) + + if self.errors == "raise": + raise ValueError( + f"The variable(s) {varnames_str} contain(s) multiple frequent " + f"categories. Set errors='warn' or errors='ignore' to allow " + f"imputation using the first most frequent category found." + ) + elif self.errors == "warn": + warnings.warn( + f"Variable(s) {varnames_str} have multiple frequent categories. " + f"The first category found will be used for imputation.", + UserWarning, + ) self.imputer_dict_ = mode_vals.iloc[0].to_dict() diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 182e8826b..1e55212d5 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -1,8 +1,19 @@ +import numpy as np +import pandas as pd import pandas as pd import pytest +import warnings from feature_engine.imputation import CategoricalImputer +# --- Shared fixture: perfectly multimodal variable --- +@pytest.fixture +def multimodal_df(): + return pd.DataFrame({ + "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], + "country": ["UK", "UK", "FR", "FR", "DE", "DE"], + }) + def test_impute_with_string_missing_and_automatically_find_variables(df_na): # set up transformer @@ -150,14 +161,22 @@ def test_error_when_imputation_method_not_frequent_or_missing(): def test_error_when_variable_contains_multiple_modes(df_na): - msg = "The variable Name contains multiple frequent categories." + msg = ( + "The variable Name contains multiple frequent categories. " + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) imputer = CategoricalImputer(imputation_method="frequent", variables="Name") with pytest.raises(ValueError) as record: imputer.fit(df_na) # check that error message matches assert str(record.value) == msg - msg = "The variable(s) Name contain(s) multiple frequent categories." + msg = ( + "The variable(s) Name contain(s) multiple frequent categories. " + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) imputer = CategoricalImputer(imputation_method="frequent") with pytest.raises(ValueError) as record: imputer.fit(df_na) @@ -166,7 +185,11 @@ def test_error_when_variable_contains_multiple_modes(df_na): df_ = df_na.copy() df_["Name_dup"] = df_["Name"] - msg = "The variable(s) Name, Name_dup contain(s) multiple frequent categories." + msg = ( + "The variable(s) Name, Name_dup contain(s) multiple frequent categories. " + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) imputer = CategoricalImputer(imputation_method="frequent") with pytest.raises(ValueError) as record: imputer.fit(df_) @@ -305,3 +328,51 @@ def test_error_when_ignore_format_is_not_boolean(ignore_format): # check that error message matches assert str(record.value) == msg + + +def test_errors_raise_on_multimodal_is_default(multimodal_df): + """Default behaviour: raise ValueError on multimodal variable.""" + imputer = CategoricalImputer(imputation_method="frequent") + with pytest.raises(ValueError, match="multiple frequent categories"): + imputer.fit(multimodal_df) + + +def test_errors_warn_emits_userwarning(multimodal_df): + """errors='warn': UserWarning must be emitted.""" + imputer = CategoricalImputer(imputation_method="frequent", errors="warn") + with pytest.warns(UserWarning, match="multiple frequent categories"): + imputer.fit(multimodal_df) + + +def test_errors_warn_uses_first_mode(multimodal_df): + """errors='warn': imputer_dict_ should contain the first mode.""" + imputer = CategoricalImputer(imputation_method="frequent", errors="warn") + with pytest.warns(UserWarning): + imputer.fit(multimodal_df) + expected = multimodal_df["city"].mode()[0] + assert imputer.imputer_dict_["city"] == expected + + +def test_errors_ignore_no_warning_raised(multimodal_df): + """errors='ignore': no warnings should be emitted.""" + imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") + with warnings.catch_warnings(): + warnings.simplefilter("error") # Promote all warnings to errors + imputer.fit(multimodal_df) # Should NOT raise + assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] + + +def test_errors_invalid_value_raises(): + """Passing an unsupported value for errors should raise ValueError at init.""" + with pytest.raises(ValueError, match="errors takes only values"): + CategoricalImputer(imputation_method="frequent", errors="bad_value") + + +def test_errors_param_ignored_when_imputation_method_is_missing(): + """errors param has no effect for imputation_method='missing'.""" + df = pd.DataFrame({"city": ["London", np.nan, "Paris"]}) + imputer = CategoricalImputer(imputation_method="missing", errors="warn") + # Should fit without warnings since there's no mode computation + with warnings.catch_warnings(): + warnings.simplefilter("error") + imputer.fit(df) From 81be3489fb56fc80ab1f8906bc5d12111bb19858 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 8 Mar 2026 20:41:13 +0530 Subject: [PATCH 02/16] style: fix flake8 line length in CategoricalImputer --- feature_engine/imputation/categorical.py | 28 +++++++++++-------- .../test_categorical_imputer.py | 11 +++++--- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 40c0a1276..cc1c2e2d2 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -212,15 +212,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): if len(mode_vals) > 1: if self.errors == "raise": raise ValueError( - f"The variable {var} contains multiple frequent categories. " - f"Set errors='warn' or errors='ignore' to allow imputation " - f"using the first most frequent category found." + f"The variable {var} contains multiple " + f"frequent categories. Set errors='warn' or " + f"errors='ignore' to allow imputation using " + f"the first most frequent category found." ) elif self.errors == "warn": warnings.warn( - f"Variable {var} has multiple frequent categories. " - f"The first category found, {mode_vals[0]}, will be used " - f"for imputation.", + f"Variable {var} has multiple frequent " + f"categories. The first category found, " + f"{mode_vals[0]}, will be used for imputation.", UserWarning, ) @@ -242,14 +243,17 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): if self.errors == "raise": raise ValueError( - f"The variable(s) {varnames_str} contain(s) multiple frequent " - f"categories. Set errors='warn' or errors='ignore' to allow " - f"imputation using the first most frequent category found." + f"The variable(s) {varnames_str} contain(s) " + f"multiple frequent categories. Set " + f"errors='warn' or errors='ignore' to allow " + f"imputation using the first most frequent " + f"category found." ) elif self.errors == "warn": warnings.warn( - f"Variable(s) {varnames_str} have multiple frequent categories. " - f"The first category found will be used for imputation.", + f"Variable(s) {varnames_str} have multiple " + f"frequent categories. The first category " + f"found will be used for imputation.", UserWarning, ) @@ -301,4 +305,4 @@ def _more_tags(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.allow_nan = True - return tags + return tags \ No newline at end of file diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 1e55212d5..c6ea41d89 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -6,13 +6,16 @@ from feature_engine.imputation import CategoricalImputer + # --- Shared fixture: perfectly multimodal variable --- @pytest.fixture def multimodal_df(): - return pd.DataFrame({ - "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], - "country": ["UK", "UK", "FR", "FR", "DE", "DE"], - }) + return pd.DataFrame( + { + "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], + "country": ["UK", "UK", "FR", "FR", "DE", "DE"], + } + ) def test_impute_with_string_missing_and_automatically_find_variables(df_na): From 4fb5b7aa6cd37077cd91a046df8bf921e02e52b6 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 8 Mar 2026 20:48:01 +0530 Subject: [PATCH 03/16] style: fix import order and duplicate pandas import --- feature_engine/imputation/categorical.py | 32 +++++++------------ .../test_categorical_imputer.py | 1 - 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index cc1c2e2d2..2d1f48e97 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -1,34 +1,26 @@ # Authors: Soledad Galli # License: BSD 3 clause -from typing import List, Optional, Union import warnings +from typing import List, Optional, Union import pandas as pd -from feature_engine._check_init_parameters.check_variables import ( - _check_variables_input_value, -) +from feature_engine._check_init_parameters.check_variables import \ + _check_variables_input_value from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, - _imputer_dict_docstring, - _n_features_in_docstring, - _variables_attribute_docstring, -) -from feature_engine._docstrings.methods import ( - _fit_transform_docstring, - _transform_imputers_docstring, -) + _feature_names_in_docstring, _imputer_dict_docstring, + _n_features_in_docstring, _variables_attribute_docstring) +from feature_engine._docstrings.methods import (_fit_transform_docstring, + _transform_imputers_docstring) from feature_engine._docstrings.substitute import Substitution from feature_engine.dataframe_checks import check_X from feature_engine.imputation.base_imputer import BaseImputer from feature_engine.tags import _return_tags -from feature_engine.variable_handling import ( - check_all_variables, - check_categorical_variables, - find_all_variables, - find_categorical_variables, -) +from feature_engine.variable_handling import (check_all_variables, + check_categorical_variables, + find_all_variables, + find_categorical_variables) @Substitution( @@ -305,4 +297,4 @@ def _more_tags(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.allow_nan = True - return tags \ No newline at end of file + return tags diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index c6ea41d89..788a7b924 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -1,6 +1,5 @@ import numpy as np import pandas as pd -import pandas as pd import pytest import warnings From 835133f4c12b072f09310d6a17c4f81aaadbc11f Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 8 Mar 2026 22:49:48 +0530 Subject: [PATCH 04/16] test: add coverage for errors='ignore' branches --- .../test_categorical_imputer.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 788a7b924..995db0c69 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -1,7 +1,8 @@ +import warnings + import numpy as np import pandas as pd import pytest -import warnings from feature_engine.imputation import CategoricalImputer @@ -378,3 +379,27 @@ def test_errors_param_ignored_when_imputation_method_is_missing(): with warnings.catch_warnings(): warnings.simplefilter("error") imputer.fit(df) + + +def test_errors_ignore_single_variable(): + """errors='ignore' on single multimodal variable — silent, uses first mode.""" + X = pd.DataFrame( + {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} + ) + imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") + imputer.fit(X) + assert imputer.imputer_dict_["city"] == X["city"].mode()[0] + + +def test_errors_ignore_multiple_variables(): + """errors='ignore' on multiple multimodal variables — silent, uses first mode.""" + X = pd.DataFrame( + { + "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], + "country": ["UK", "UK", "FR", "FR", "DE", "DE"], + } + ) + imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") + imputer.fit(X) + assert imputer.imputer_dict_["city"] == X["city"].mode()[0] + assert imputer.imputer_dict_["country"] == X["country"].mode()[0] \ No newline at end of file From 81f31d8af4613b2fbfd2b7ebbdbc6f3fa087c4b7 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 8 Mar 2026 22:53:33 +0530 Subject: [PATCH 05/16] style: add missing newline at end of test file --- tests/test_imputation/test_categorical_imputer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 995db0c69..de4ce0bc4 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -402,4 +402,4 @@ def test_errors_ignore_multiple_variables(): imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") imputer.fit(X) assert imputer.imputer_dict_["city"] == X["city"].mode()[0] - assert imputer.imputer_dict_["country"] == X["country"].mode()[0] \ No newline at end of file + assert imputer.imputer_dict_["country"] == X["country"].mode()[0] From 9e3bb5cc8edccd7f3648170a571d04b6ed67f54d Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 8 Mar 2026 23:16:13 +0530 Subject: [PATCH 06/16] style: fix import order in count_frequency and base_encoder --- docs/whats_new/v_190.rst | 1 + fail_detail.txt | 77 ++++++++++++++++++ feature_engine/encoding/base_encoder.py | 44 ++++++---- feature_engine/encoding/count_frequency.py | 32 +++----- test_results.txt | Bin 0 -> 13284 bytes test_results_utf8.txt | 21 +++++ .../test_count_frequency_encoder.py | 63 +++++++++++++- 7 files changed, 198 insertions(+), 40 deletions(-) create mode 100644 fail_detail.txt create mode 100644 test_results.txt create mode 100644 test_results_utf8.txt diff --git a/docs/whats_new/v_190.rst b/docs/whats_new/v_190.rst index f1b6e22da..7f9ed486a 100644 --- a/docs/whats_new/v_190.rst +++ b/docs/whats_new/v_190.rst @@ -54,6 +54,7 @@ Enhancements ~~~~~~~~~~~~ - Added `errors` parameter to `CategoricalImputer` to handle categorical variables with multiple frequent categories instead of automatically raising a `ValueError`. (`DirekKakkar `_) +- Added ``unseen='warn'`` option to `CountFrequencyEncoder`: unseen categories are encoded as ``NaN`` and a ``UserWarning`` is raised listing the unseen categories per variable. (`DirekKakkar `_) - Our variable handling functions now return empty lists when no variables of the desired type are found. (`Soledad Galli `_) BUG diff --git a/fail_detail.txt b/fail_detail.txt new file mode 100644 index 000000000..514d0fb79 --- /dev/null +++ b/fail_detail.txt @@ -0,0 +1,77 @@ +============================= test session starts ============================= +platform win32 -- Python 3.14.0, pytest-9.0.2, pluggy-1.6.0 +rootdir: F:\feature_engine +configfile: pyproject.toml +plugins: anyio-4.12.1, dash-4.0.0, cov-7.0.0, timeout-2.4.0 +collected 1 item + +tests\test_encoding\test_count_frequency_encoder.py F + +================================== FAILURES =================================== +______________________ test_unseen_invalid_value_raises _______________________ + + def test_unseen_invalid_value_raises(): + """Invalid unseen value should raise ValueError at init.""" + with pytest.raises(ValueError, match="unseen takes only values"): +> CountFrequencyEncoder(unseen="bad_value") + +tests\test_encoding\test_count_frequency_encoder.py:537: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +self = <[AttributeError("'CountFrequencyEncoder' object has no attribute 'encoding_method'") raised in repr()] CountFrequencyEncoder object at 0x11445ccaf90> +encoding_method = 'count', variables = None, missing_values = 'raise' +ignore_format = False, unseen = 'bad_value' + + def __init__( + self, + encoding_method: str = "count", + variables: Union[None, int, str, List[Union[str, int]]] = None, + missing_values: str = "raise", + ignore_format: bool = False, + unseen: str = "ignore", + ) -> None: + + if encoding_method not in ["count", "frequency"]: + raise ValueError( + "encoding_method takes only values 'count' and 'frequency'. " + f"Got {encoding_method} instead." + ) + +> check_parameter_unseen(unseen, ["ignore", "raise", "encode", "warn"]) + +feature_engine\encoding\count_frequency.py:171: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +unseen = 'bad_value', accepted_values = ['ignore', 'raise', 'encode', 'warn'] + + def check_parameter_unseen(unseen, accepted_values): + if not isinstance(accepted_values, list) or not all( + isinstance(item, str) for item in accepted_values + ): + raise ValueError( + "accepted_values should be a list of strings. " + f" Got {accepted_values} instead." + ) + if unseen not in accepted_values: +> raise ValueError( + f"Parameter `unseen` takes only values {', '.join(accepted_values)}." + f" Got {unseen} instead." + ) +E ValueError: Parameter `unseen` takes only values ignore, raise, encode, warn. Got bad_value instead. + +feature_engine\encoding\_helper_functions.py:10: ValueError + +During handling of the above exception, another exception occurred: + + def test_unseen_invalid_value_raises(): + """Invalid unseen value should raise ValueError at init.""" +> with pytest.raises(ValueError, match="unseen takes only values"): + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E AssertionError: Regex pattern did not match. +E Expected regex: 'unseen takes only values' +E Actual message: 'Parameter `unseen` takes only values ignore, raise, encode, warn. Got bad_value instead.' + +tests\test_encoding\test_count_frequency_encoder.py:536: AssertionError +=========================== short test summary info =========================== +FAILED tests/test_encoding/test_count_frequency_encoder.py::test_unseen_invalid_value_raises +============================== 1 failed in 0.28s ============================== diff --git a/feature_engine/encoding/base_encoder.py b/feature_engine/encoding/base_encoder.py index b4ae3478f..276bc1e26 100644 --- a/feature_engine/encoding/base_encoder.py +++ b/feature_engine/encoding/base_encoder.py @@ -6,27 +6,21 @@ from sklearn.utils.validation import check_is_fitted from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin -from feature_engine._check_init_parameters.check_variables import ( - _check_variables_input_value, -) +from feature_engine._check_init_parameters.check_variables import \ + _check_variables_input_value from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _missing_values_docstring, - _variables_categorical_docstring, -) -from feature_engine._docstrings.init_parameters.encoders import _ignore_format_docstring + _missing_values_docstring, _variables_categorical_docstring) +from feature_engine._docstrings.init_parameters.encoders import \ + _ignore_format_docstring from feature_engine._docstrings.substitute import Substitution -from feature_engine.dataframe_checks import ( - _check_optional_contains_na, - _check_X_matches_training_df, - check_X, -) +from feature_engine.dataframe_checks import (_check_optional_contains_na, + _check_X_matches_training_df, + check_X) from feature_engine.tags import _return_tags -from feature_engine.variable_handling import ( - check_all_variables, - check_categorical_variables, - find_all_variables, - find_categorical_variables, -) +from feature_engine.variable_handling import (check_all_variables, + check_categorical_variables, + find_all_variables, + find_categorical_variables) @Substitution( @@ -221,6 +215,18 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: def _encode(self, X: pd.DataFrame) -> pd.DataFrame: # replace categories by the learned parameters for feature in self.encoder_dict_.keys(): + # Detect unseen categories BEFORE mapping so we can name them + if self.unseen == "warn": + unseen_cats = set(X[feature].dropna().unique()) - set( + self.encoder_dict_[feature].keys() + ) + if unseen_cats: + warnings.warn( + f"Variable {feature!r} contains unseen categories: " + f"{unseen_cats}. These will be encoded as NaN.", + UserWarning, + ) + X[feature] = X[feature].map(self.encoder_dict_[feature]) # if original variables are cast as categorical, they will remain @@ -266,6 +272,8 @@ def _check_nan_values_after_transformation(self, X): "During the encoding, NaN values were introduced in the feature(s) " f"{nan_columns_str}." ) + # 'warn': per-variable warnings were already issued in _encode before + # the mapping, so nothing more to do here. def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame: """Convert the encoded variable back to the original values. diff --git a/feature_engine/encoding/count_frequency.py b/feature_engine/encoding/count_frequency.py index ae6507627..854e3ea26 100644 --- a/feature_engine/encoding/count_frequency.py +++ b/feature_engine/encoding/count_frequency.py @@ -6,34 +6,26 @@ import pandas as pd from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, - _n_features_in_docstring, - _variables_attribute_docstring, -) + _feature_names_in_docstring, _n_features_in_docstring, + _variables_attribute_docstring) from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _missing_values_docstring, - _variables_categorical_docstring, -) + _missing_values_docstring, _variables_categorical_docstring) from feature_engine._docstrings.init_parameters.encoders import ( - _ignore_format_docstring, - _unseen_docstring, -) -from feature_engine._docstrings.methods import ( - _fit_transform_docstring, - _inverse_transform_docstring, - _transform_encoders_docstring, -) + _ignore_format_docstring, _unseen_docstring) +from feature_engine._docstrings.methods import (_fit_transform_docstring, + _inverse_transform_docstring, + _transform_encoders_docstring) from feature_engine._docstrings.substitute import Substitution from feature_engine.dataframe_checks import check_X from feature_engine.encoding._helper_functions import check_parameter_unseen -from feature_engine.encoding.base_encoder import ( - CategoricalInitMixinNA, - CategoricalMethodsMixin, -) +from feature_engine.encoding.base_encoder import (CategoricalInitMixinNA, + CategoricalMethodsMixin) _unseen_docstring = ( _unseen_docstring + """ If `'encode'`, unseen categories will be encoded as 0 (zero).""" + + """ If `'warn'`, unseen categories will be encoded as NaN and a""" + + """ UserWarning is raised listing the unseen categories per variable.""" ) @@ -166,7 +158,7 @@ def __init__( f"Got {encoding_method} instead." ) - check_parameter_unseen(unseen, ["ignore", "raise", "encode"]) + check_parameter_unseen(unseen, ["ignore", "raise", "encode", "warn"]) super().__init__(variables, missing_values, ignore_format) self.encoding_method = encoding_method self.unseen = unseen diff --git a/test_results.txt b/test_results.txt new file mode 100644 index 0000000000000000000000000000000000000000..c032dbfd42a24aa8e2dfe7c18d787ab9dcb783ad GIT binary patch literal 13284 zcmds;QBNC35XbkqQoq9qsY)v~G{%5Kq&!3ksVem$O4PnU>k!)zH^!!S20}l4+y8He z<@Vw`u#M?bbh3T7ySF?4*_qkd+3`PrKX=PsnVY(SEA*Y|o4djd-NcRk^VA) zmL1`_``UfwPTWX+P2HjT&fT&4JJ&d4*LEAaf2=EWZ`_4ie7qdJ?bcn(ZR-1p`(Cvt zzSd}c>~?(ly56?Bkvnxqx-QgHOLg16KJ@(3J#+h&{@cFq{mWi$%yr@atPk||L~T#B z3|wDy=6XL=FovN2(jXDw4Q`ORs&4lr0A{w9*su=&sb7x(xGL^-(@GsN5%<@pJ2kN)*^wU$2 zy~6!)b#EM^7dT1ZaUJESDxHdxhn}N<>KUH)ggt(Dpjw-;XW~v6i59u%YOdBp^%1{g z<8?iDlf=I2#~PFIPW-#a9tZtVcP73fdjq$pyWD;7wYk5)aK8!l8~4J!)3xhf>1j`& zOx*ZJbq|jNuRS-gBTdLRA1LxxBjV0ILyYz?L+DIb+BJ}D7Yj%^8hoO89_xcuX`1Ps z2ZsA5*>1_w-Y>1b?MyexG^_H%LRzi~g*=jaiEhcKc(|xH8&Rj#V^(DR23B9z*8^Ff zuD;DrJyI=x3jV-e)S{4gQ$LFviM4cDJWaZ|A$d(Jm3UtGYWTJ-eej5K&Dnu`A$-KH z_w@0m@R)=5#*SjvGulY3V+Y>f#mB~DABtYQyOH-SSu=yrJByMP)V$EZ3-&jLde%>^ z$Nu!5%w+I_&;9-fFM+k5Uuy8XO(Xldt3%S3_NA4{Wj%a$XvipbW$6CW6Fz>7dVVjz zom?~Tk_$I^Lv6S9fMuY`8n<0q9X^+y=3aK$H)DB;V4$<;s@Y%G$^$Rc*sQ9Lh`nQH zdn6seh7)b6)9}rEq~UjmG3_p$s6#bcG4*polnZ~=Y||>x<4=vfrK?KA(C-W5RjpnY z(rS8GjqnYj7@4i|&7wR_8o=7NCw_*v>;1+}MaWXFX{$j=j1jp)dg^g*CD!cK;EZLq z31?dEUJcG*@0&2D-KS%(uq=hv9*g>SBR#tdE!|42rK?9gkH)6mp|2kE&Dy)2co)BV z%-3u3cH)Q|J0Z@u;yrHIC7N!tM;yrCv{SKpe8}y@d;NJZRsEql?>L+xRa?&IIc@6{ zGo?=vcJ-ycEi$LqA&;$AozK`wtiLgRv#}f^9JXGu8;avs^F?wxiFf`)FgTMumQGsi zh8QX9ExSV^a${fn!E1H>4zH6~JHLz~LOuFNrh>(PzqB+LwL8^L;vG2}b#}>TIA6DN zlzTYa^ToELPJ5mv_R$x^RMoYcv89b?FNUqOoXqXb*wXHl=fPI)gV8wTv5feJoQLwX ztX?|P#=zoPz;;_i49~?fH^6l&I|7PO3unk-nLUx%NvvzlXv%XJv+Oos?6>N1q}}f} z;rQfb?A%LPy?+(!7}0oEHzM1XeZ<*JJ>Ims)`ZtAd{yq^IPH8Zz)s?QZWW!`6;8^A zK8jf8L{<0unR~5VsGr==SGiCvX;*nxL_XB=S%&4C+G2B3X4?@^d>r^N=2Ij}`8c9mn-j zT8@-WB{S%HoaMKxawz7^Id|Vy*4^iBP3O*JCdIjVYwkPGnd@>f_tjhIH*n~BzJ%<; zmblQ0_`rCF(m4kaa) zf!^WWwX4-`O|jqQ_|f&}Nh48i)E;`?TNWsuLUEOGiDr|R1X&T$%e-7%kIs zS9pm=F!!|H)i_*rEui6z_W{Tp34G4n`-;@OS1V4%KB)!!A+XrWQra@gW8TBll0Ked zp7R{6JXx_ns!wvPXJylPgpQIY+!qJI#OIZ0W1I}sl)t~b z6c?NcdhV}N&8JzzPo z7swX+6MANxM+@`yQAPh5+-22a16k=Y>`bkJr)wc_p`(I}E$67hRhXw76ayQd@pNV5w4syc!eudrirD%xh W6B2IgAOE-D1erBt(Xs6RGyeh< Date: Sun, 8 Mar 2026 23:32:34 +0530 Subject: [PATCH 07/16] chore: remove accidental test_results.txt file --- test_results.txt | Bin 13284 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test_results.txt diff --git a/test_results.txt b/test_results.txt deleted file mode 100644 index c032dbfd42a24aa8e2dfe7c18d787ab9dcb783ad..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13284 zcmds;QBNC35XbkqQoq9qsY)v~G{%5Kq&!3ksVem$O4PnU>k!)zH^!!S20}l4+y8He z<@Vw`u#M?bbh3T7ySF?4*_qkd+3`PrKX=PsnVY(SEA*Y|o4djd-NcRk^VA) zmL1`_``UfwPTWX+P2HjT&fT&4JJ&d4*LEAaf2=EWZ`_4ie7qdJ?bcn(ZR-1p`(Cvt zzSd}c>~?(ly56?Bkvnxqx-QgHOLg16KJ@(3J#+h&{@cFq{mWi$%yr@atPk||L~T#B z3|wDy=6XL=FovN2(jXDw4Q`ORs&4lr0A{w9*su=&sb7x(xGL^-(@GsN5%<@pJ2kN)*^wU$2 zy~6!)b#EM^7dT1ZaUJESDxHdxhn}N<>KUH)ggt(Dpjw-;XW~v6i59u%YOdBp^%1{g z<8?iDlf=I2#~PFIPW-#a9tZtVcP73fdjq$pyWD;7wYk5)aK8!l8~4J!)3xhf>1j`& zOx*ZJbq|jNuRS-gBTdLRA1LxxBjV0ILyYz?L+DIb+BJ}D7Yj%^8hoO89_xcuX`1Ps z2ZsA5*>1_w-Y>1b?MyexG^_H%LRzi~g*=jaiEhcKc(|xH8&Rj#V^(DR23B9z*8^Ff zuD;DrJyI=x3jV-e)S{4gQ$LFviM4cDJWaZ|A$d(Jm3UtGYWTJ-eej5K&Dnu`A$-KH z_w@0m@R)=5#*SjvGulY3V+Y>f#mB~DABtYQyOH-SSu=yrJByMP)V$EZ3-&jLde%>^ z$Nu!5%w+I_&;9-fFM+k5Uuy8XO(Xldt3%S3_NA4{Wj%a$XvipbW$6CW6Fz>7dVVjz zom?~Tk_$I^Lv6S9fMuY`8n<0q9X^+y=3aK$H)DB;V4$<;s@Y%G$^$Rc*sQ9Lh`nQH zdn6seh7)b6)9}rEq~UjmG3_p$s6#bcG4*polnZ~=Y||>x<4=vfrK?KA(C-W5RjpnY z(rS8GjqnYj7@4i|&7wR_8o=7NCw_*v>;1+}MaWXFX{$j=j1jp)dg^g*CD!cK;EZLq z31?dEUJcG*@0&2D-KS%(uq=hv9*g>SBR#tdE!|42rK?9gkH)6mp|2kE&Dy)2co)BV z%-3u3cH)Q|J0Z@u;yrHIC7N!tM;yrCv{SKpe8}y@d;NJZRsEql?>L+xRa?&IIc@6{ zGo?=vcJ-ycEi$LqA&;$AozK`wtiLgRv#}f^9JXGu8;avs^F?wxiFf`)FgTMumQGsi zh8QX9ExSV^a${fn!E1H>4zH6~JHLz~LOuFNrh>(PzqB+LwL8^L;vG2}b#}>TIA6DN zlzTYa^ToELPJ5mv_R$x^RMoYcv89b?FNUqOoXqXb*wXHl=fPI)gV8wTv5feJoQLwX ztX?|P#=zoPz;;_i49~?fH^6l&I|7PO3unk-nLUx%NvvzlXv%XJv+Oos?6>N1q}}f} z;rQfb?A%LPy?+(!7}0oEHzM1XeZ<*JJ>Ims)`ZtAd{yq^IPH8Zz)s?QZWW!`6;8^A zK8jf8L{<0unR~5VsGr==SGiCvX;*nxL_XB=S%&4C+G2B3X4?@^d>r^N=2Ij}`8c9mn-j zT8@-WB{S%HoaMKxawz7^Id|Vy*4^iBP3O*JCdIjVYwkPGnd@>f_tjhIH*n~BzJ%<; zmblQ0_`rCF(m4kaa) zf!^WWwX4-`O|jqQ_|f&}Nh48i)E;`?TNWsuLUEOGiDr|R1X&T$%e-7%kIs zS9pm=F!!|H)i_*rEui6z_W{Tp34G4n`-;@OS1V4%KB)!!A+XrWQra@gW8TBll0Ked zp7R{6JXx_ns!wvPXJylPgpQIY+!qJI#OIZ0W1I}sl)t~b z6c?NcdhV}N&8JzzPo z7swX+6MANxM+@`yQAPh5+-22a16k=Y>`bkJr)wc_p`(I}E$67hRhXw76ayQd@pNV5w4syc!eudrirD%xh W6B2IgAOE-D1erBt(Xs6RGyeh< Date: Mon, 9 Mar 2026 00:07:22 +0530 Subject: [PATCH 08/16] feat(AddMissingIndicator, DropMissingData): raise error when variables and missing_only=True are used together (#905) --- .../imputation/drop_missing_data.py | 18 ++++++---- .../imputation/missing_indicator.py | 16 ++++++--- .../test_imputation/test_drop_missing_data.py | 36 ++++++++++++++----- .../test_imputation/test_missing_indicator.py | 32 ++++++++++++++--- 4 files changed, 78 insertions(+), 24 deletions(-) diff --git a/feature_engine/imputation/drop_missing_data.py b/feature_engine/imputation/drop_missing_data.py index 07c6f3e75..56e77156a 100644 --- a/feature_engine/imputation/drop_missing_data.py +++ b/feature_engine/imputation/drop_missing_data.py @@ -45,15 +45,14 @@ class DropMissingData(BaseImputer, TransformXyMixin): will check missing data in all variables in the dataframe. Alternatively, the imputer will evaluate missing data only in the variables in the list. - Note that if `missing_only=True`, missing data will be removed from variables - that had missing data in the train set. These might be a subset of the - variables indicated in the list. + If a list of variables is provided, ``missing_only`` must be set to ``False``. missing_only: bool, default=True If `True`, rows will be dropped when they show missing data in variables that - had missing data during `fit()`. If `False`, rows will be dropped if there is - missing data in any of the variables. This parameter only works when - `threshold=None`, otherwise it is ignored. + had missing data during `fit()`. Only valid when ``variables=None``. If + `False`, rows will be dropped if there is missing data in any of the + variables. This parameter only works when `threshold=None`, otherwise it is + ignored. threshold: int or float, default=None Require that percentage of non-NA values in a row to keep it. If @@ -131,6 +130,13 @@ def __init__( self.missing_only = missing_only self.threshold = threshold + if self.variables is not None and missing_only is True: + raise ValueError( + "variables and missing_only cannot be used together. " + "Set variables=None to use missing_only=True, or set " + "missing_only=False to pass a list of variables." + ) + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ Find the variables for which missing data should be evaluated to decide if a diff --git a/feature_engine/imputation/missing_indicator.py b/feature_engine/imputation/missing_indicator.py index 01660a654..d7d5fffef 100644 --- a/feature_engine/imputation/missing_indicator.py +++ b/feature_engine/imputation/missing_indicator.py @@ -50,14 +50,15 @@ class AddMissingIndicator(BaseImputer): data or to all variables. **True**: indicators will be created only for those variables that showed - missing data during `fit()`. + missing data during `fit()`. Only valid when ``variables=None``. - **False**: indicators will be created for all variables + **False**: indicators will be created for all variables passed in + ``variables``, or all variables in the dataset if ``variables=None``. variables: list, default=None The list of variables to impute. If None, the imputer will find and - select all variables. - + select all variables. If a list of variables is provided, + ``missing_only`` must be set to ``False``. Attributes ---------- @@ -111,6 +112,13 @@ def __init__( self.variables = _check_variables_input_value(variables) self.missing_only = missing_only + if self.variables is not None and missing_only is True: + raise ValueError( + "variables and missing_only cannot be used together. " + "Set variables=None to use missing_only=True, or set " + "missing_only=False to pass a list of variables." + ) + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ Learn the variables for which the missing indicators will be created. diff --git a/tests/test_imputation/test_drop_missing_data.py b/tests/test_imputation/test_drop_missing_data.py index ee49fee82..ddbf6b8af 100644 --- a/tests/test_imputation/test_drop_missing_data.py +++ b/tests/test_imputation/test_drop_missing_data.py @@ -48,20 +48,18 @@ def test_selelct_all_variables_when_variables_is_none(df_na): def test_detect_variables_with_na_in_variables_entered_by_user(df_na): imputer = DropMissingData( - missing_only=True, variables=["City", "Studies", "Age", "dob"] + missing_only=False, variables=["City", "Studies", "Age", "dob"] ) X_transformed = imputer.fit_transform(df_na) assert imputer.variables == ["City", "Studies", "Age", "dob"] - assert imputer.variables_ == ["City", "Studies", "Age"] + assert imputer.variables_ == ["City", "Studies", "Age", "dob"] assert X_transformed.shape == (6, 6) def test_return_na_data_method(df_na): # test with vars - imputer = DropMissingData( - threshold=0.5, variables=["City", "Studies", "Age", "Marks"] - ) + imputer = DropMissingData(missing_only=False, threshold=0.5, variables=["City", "Studies", "Age", "Marks"]) imputer.fit_transform(df_na) X_nona = imputer.return_na_data(df_na) assert list(X_nona.index) == [2, 3] @@ -125,13 +123,33 @@ def test_threshold_value_error(df_na): def test_threshold_with_variables(df_na): # Each row must have 100% data avaiable for columns ['Marks'] - imputer = DropMissingData(threshold=1, variables=["Marks"]) + imputer = DropMissingData(missing_only=False, threshold=1, variables=["Marks"]) X = imputer.fit_transform(df_na) assert list(X.index) == [0, 1, 2, 4, 6, 7] # Each row must have 25% data avaiable for ['City', 'Studies', 'Age', 'Marks'] - imputer = DropMissingData( - threshold=0.75, variables=["City", "Studies", "Age", "Marks"] - ) + imputer = DropMissingData(missing_only=False, threshold=0.75, variables=["City", "Studies", "Age", "Marks"]) X = imputer.fit_transform(df_na) assert list(X.index) == [0, 1, 4, 5, 6, 7] + + +# --------------------------------------------------------------------------- +# Tests for variables + missing_only mutual exclusivity +# --------------------------------------------------------------------------- + +def test_error_when_variables_and_missing_only_true(): + """Passing both variables and missing_only=True should raise ValueError.""" + with pytest.raises(ValueError, match="variables and missing_only"): + DropMissingData(missing_only=True, variables=["Age", "Name"]) + + +def test_no_error_when_variables_and_missing_only_false(): + """variables + missing_only=False is valid — should not raise.""" + imputer = DropMissingData(missing_only=False, variables=["Age"]) + assert imputer.variables is not None + + +def test_no_error_when_variables_none_and_missing_only_true(): + """variables=None + missing_only=True is valid — default case.""" + imputer = DropMissingData(missing_only=True, variables=None) + assert imputer.missing_only is True diff --git a/tests/test_imputation/test_missing_indicator.py b/tests/test_imputation/test_missing_indicator.py index a7f6e9f7c..24d3626a7 100644 --- a/tests/test_imputation/test_missing_indicator.py +++ b/tests/test_imputation/test_missing_indicator.py @@ -35,7 +35,7 @@ def test_add_indicators_to_all_variables_when_variables_is_none(df_na): def test_add_indicators_to_one_variable(df_na): - imputer = AddMissingIndicator(variables="Name") + imputer = AddMissingIndicator(missing_only=False, variables="Name") X_transformed = imputer.fit_transform(df_na) assert imputer.variables_ == ["Name"] assert X_transformed.shape == (8, 7) @@ -45,14 +45,14 @@ def test_add_indicators_to_one_variable(df_na): def test_detect_variables_with_missing_data_in_variables_entered_by_user(df_na): imputer = AddMissingIndicator( - missing_only=True, variables=["City", "Studies", "Age", "dob"] + missing_only=False, variables=["City", "Studies", "Age", "dob"] ) X_transformed = imputer.fit_transform(df_na) assert imputer.variables == ["City", "Studies", "Age", "dob"] - assert imputer.variables_ == ["City", "Studies", "Age"] - assert X_transformed.shape == (8, 9) + assert imputer.variables_ == ["City", "Studies", "Age", "dob"] + assert X_transformed.shape == (8, 10) assert "City_na" in X_transformed.columns - assert "dob_na" not in X_transformed.columns + assert "dob_na" in X_transformed.columns assert X_transformed["City_na"].sum() == 2 @@ -123,3 +123,25 @@ def test_no_performance_warning_with_many_variables(): issubclass(w.category, pd.errors.PerformanceWarning) for w in captured ), "PerformanceWarning was raised during transform" + + +# --------------------------------------------------------------------------- +# Tests for variables + missing_only mutual exclusivity +# --------------------------------------------------------------------------- + +def test_error_when_variables_and_missing_only_true(): + """Passing both variables and missing_only=True should raise ValueError.""" + with pytest.raises(ValueError, match="variables and missing_only"): + AddMissingIndicator(missing_only=True, variables=["Age", "Name"]) + + +def test_no_error_when_variables_and_missing_only_false(): + """variables + missing_only=False is valid — should not raise.""" + imputer = AddMissingIndicator(missing_only=False, variables=["Age"]) + assert imputer.variables is not None + + +def test_no_error_when_variables_none_and_missing_only_true(): + """variables=None + missing_only=True is valid — default case.""" + imputer = AddMissingIndicator(missing_only=True, variables=None) + assert imputer.missing_only is True From dee65fc283ad0b68fd18e7f8762cf79abd6fbf70 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 13 Mar 2026 15:05:17 +0530 Subject: [PATCH 09/16] fixing the ci/circleci:test_style --- .gitignore | 1 + tests/test_imputation/test_missing_indicator.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3ba72acd9..0096d1595 100644 --- a/.gitignore +++ b/.gitignore @@ -86,6 +86,7 @@ celerybeat-schedule # Environments .env .venv +.venv_wsl env/ venv/ ENV/ diff --git a/tests/test_imputation/test_missing_indicator.py b/tests/test_imputation/test_missing_indicator.py index 24d3626a7..2dd7af89e 100644 --- a/tests/test_imputation/test_missing_indicator.py +++ b/tests/test_imputation/test_missing_indicator.py @@ -1,8 +1,8 @@ import warnings + import numpy as np import pandas as pd import pytest - from sklearn.pipeline import Pipeline from feature_engine.imputation import AddMissingIndicator From 0983cd609779d57e5d2da6d12a6b25dc2f382fc0 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 13 Mar 2026 15:15:39 +0530 Subject: [PATCH 10/16] fixing the ci/circleci:test_style --- tests/test_imputation/test_drop_missing_data.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_imputation/test_drop_missing_data.py b/tests/test_imputation/test_drop_missing_data.py index ddbf6b8af..413e6a279 100644 --- a/tests/test_imputation/test_drop_missing_data.py +++ b/tests/test_imputation/test_drop_missing_data.py @@ -59,7 +59,9 @@ def test_detect_variables_with_na_in_variables_entered_by_user(df_na): def test_return_na_data_method(df_na): # test with vars - imputer = DropMissingData(missing_only=False, threshold=0.5, variables=["City", "Studies", "Age", "Marks"]) + imputer = DropMissingData( + missing_only=False, threshold=0.5, variables=["City", "Studies", "Age", "Marks"] + ) imputer.fit_transform(df_na) X_nona = imputer.return_na_data(df_na) assert list(X_nona.index) == [2, 3] @@ -128,7 +130,11 @@ def test_threshold_with_variables(df_na): assert list(X.index) == [0, 1, 2, 4, 6, 7] # Each row must have 25% data avaiable for ['City', 'Studies', 'Age', 'Marks'] - imputer = DropMissingData(missing_only=False, threshold=0.75, variables=["City", "Studies", "Age", "Marks"]) + imputer = DropMissingData( + missing_only=False, + threshold=0.75, + variables=["City", "Studies", "Age", "Marks"] + ) X = imputer.fit_transform(df_na) assert list(X.index) == [0, 1, 4, 5, 6, 7] From 6ff46aa50067ce89e3e6e91f5268e87e1a99e735 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Mon, 16 Mar 2026 15:18:19 +0530 Subject: [PATCH 11/16] test signed commit --- tests/test_imputation/test_categorical_imputer.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index de4ce0bc4..2f79f8b44 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -403,3 +403,16 @@ def test_errors_ignore_multiple_variables(): imputer.fit(X) assert imputer.imputer_dict_["city"] == X["city"].mode()[0] assert imputer.imputer_dict_["country"] == X["country"].mode()[0] + + +def test_errors_warn_single_variable(): + """errors='warn' on single multimodal variable — warns, uses first mode.""" + X = pd.DataFrame( + {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} + ) + imputer = CategoricalImputer( + imputation_method="frequent", variables="city", errors="warn" + ) + with pytest.warns(UserWarning, match="Variable city has multiple frequent"): + imputer.fit(X) + assert imputer.imputer_dict_["city"] == X["city"].mode()[0] \ No newline at end of file From 289c1d89ea64aa77b475ad4d96adbec309605914 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Mon, 16 Mar 2026 15:19:59 +0530 Subject: [PATCH 12/16] test: add warn coverage for single multimodal variable in CategoricalImputer --- tests/test_imputation/test_categorical_imputer.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 2f79f8b44..de4ce0bc4 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -403,16 +403,3 @@ def test_errors_ignore_multiple_variables(): imputer.fit(X) assert imputer.imputer_dict_["city"] == X["city"].mode()[0] assert imputer.imputer_dict_["country"] == X["country"].mode()[0] - - -def test_errors_warn_single_variable(): - """errors='warn' on single multimodal variable — warns, uses first mode.""" - X = pd.DataFrame( - {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} - ) - imputer = CategoricalImputer( - imputation_method="frequent", variables="city", errors="warn" - ) - with pytest.warns(UserWarning, match="Variable city has multiple frequent"): - imputer.fit(X) - assert imputer.imputer_dict_["city"] == X["city"].mode()[0] \ No newline at end of file From 0bfff51c4448988a246f272910ebc383016bce3e Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Mon, 16 Mar 2026 15:38:34 +0530 Subject: [PATCH 13/16] test: cover single-variable warn branch in CategoricalImputer --- tests/test_imputation/test_categorical_imputer.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index de4ce0bc4..1c0640a58 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -403,3 +403,16 @@ def test_errors_ignore_multiple_variables(): imputer.fit(X) assert imputer.imputer_dict_["city"] == X["city"].mode()[0] assert imputer.imputer_dict_["country"] == X["country"].mode()[0] + + +def test_errors_warn_single_variable(): + """errors='warn' on single multimodal variable — warns, uses first mode.""" + X = pd.DataFrame( + {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} + ) + imputer = CategoricalImputer( + imputation_method="frequent", variables=["city"], errors="warn" + ) + with pytest.warns(UserWarning, match="Variable city has multiple frequent"): + imputer.fit(X) + assert imputer.imputer_dict_["city"] == X["city"].mode()[0] From 213acab2221dc1b734d7df30f04ccfb75c924425 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Mon, 16 Mar 2026 15:48:52 +0530 Subject: [PATCH 14/16] removed the .venv_wsl as was worngly commited --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0096d1595..3ba72acd9 100644 --- a/.gitignore +++ b/.gitignore @@ -86,7 +86,6 @@ celerybeat-schedule # Environments .env .venv -.venv_wsl env/ venv/ ENV/ From 98a41749d38890232abc20909f9f7f2dbf0cc83f Mon Sep 17 00:00:00 2001 From: DirekKakkar_17 Date: Mon, 16 Mar 2026 15:50:53 +0530 Subject: [PATCH 15/16] Delete test_results_utf8.txt --- test_results_utf8.txt | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 test_results_utf8.txt diff --git a/test_results_utf8.txt b/test_results_utf8.txt deleted file mode 100644 index 2dd401b35..000000000 --- a/test_results_utf8.txt +++ /dev/null @@ -1,21 +0,0 @@ -.........................................F [100%] -================================== FAILURES =================================== -______________________ test_unseen_invalid_value_raises _______________________ -tests\test_encoding\test_count_frequency_encoder.py:537: in test_unseen_invalid_value_raises - CountFrequencyEncoder(unseen="bad_value") -feature_engine\encoding\count_frequency.py:171: in __init__ - check_parameter_unseen(unseen, ["ignore", "raise", "encode", "warn"]) -feature_engine\encoding\_helper_functions.py:10: in check_parameter_unseen - raise ValueError( -E ValueError: Parameter `unseen` takes only values ignore, raise, encode, warn. Got bad_value instead. - -During handling of the above exception, another exception occurred: -tests\test_encoding\test_count_frequency_encoder.py:536: in test_unseen_invalid_value_raises - with pytest.raises(ValueError, match="unseen takes only values"): - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -E AssertionError: Regex pattern did not match. -E Expected regex: 'unseen takes only values' -E Actual message: 'Parameter `unseen` takes only values ignore, raise, encode, warn. Got bad_value instead.' -=========================== short test summary info =========================== -FAILED tests/test_encoding/test_count_frequency_encoder.py::test_unseen_invalid_value_raises -1 failed, 41 passed in 0.51s From dc34edf983833c718e128ca342f52bc3d63c3ba9 Mon Sep 17 00:00:00 2001 From: DirekKakkar_17 Date: Mon, 16 Mar 2026 15:51:16 +0530 Subject: [PATCH 16/16] Delete fail_detail.txt --- fail_detail.txt | 77 ------------------------------------------------- 1 file changed, 77 deletions(-) delete mode 100644 fail_detail.txt diff --git a/fail_detail.txt b/fail_detail.txt deleted file mode 100644 index 514d0fb79..000000000 --- a/fail_detail.txt +++ /dev/null @@ -1,77 +0,0 @@ -============================= test session starts ============================= -platform win32 -- Python 3.14.0, pytest-9.0.2, pluggy-1.6.0 -rootdir: F:\feature_engine -configfile: pyproject.toml -plugins: anyio-4.12.1, dash-4.0.0, cov-7.0.0, timeout-2.4.0 -collected 1 item - -tests\test_encoding\test_count_frequency_encoder.py F - -================================== FAILURES =================================== -______________________ test_unseen_invalid_value_raises _______________________ - - def test_unseen_invalid_value_raises(): - """Invalid unseen value should raise ValueError at init.""" - with pytest.raises(ValueError, match="unseen takes only values"): -> CountFrequencyEncoder(unseen="bad_value") - -tests\test_encoding\test_count_frequency_encoder.py:537: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -self = <[AttributeError("'CountFrequencyEncoder' object has no attribute 'encoding_method'") raised in repr()] CountFrequencyEncoder object at 0x11445ccaf90> -encoding_method = 'count', variables = None, missing_values = 'raise' -ignore_format = False, unseen = 'bad_value' - - def __init__( - self, - encoding_method: str = "count", - variables: Union[None, int, str, List[Union[str, int]]] = None, - missing_values: str = "raise", - ignore_format: bool = False, - unseen: str = "ignore", - ) -> None: - - if encoding_method not in ["count", "frequency"]: - raise ValueError( - "encoding_method takes only values 'count' and 'frequency'. " - f"Got {encoding_method} instead." - ) - -> check_parameter_unseen(unseen, ["ignore", "raise", "encode", "warn"]) - -feature_engine\encoding\count_frequency.py:171: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -unseen = 'bad_value', accepted_values = ['ignore', 'raise', 'encode', 'warn'] - - def check_parameter_unseen(unseen, accepted_values): - if not isinstance(accepted_values, list) or not all( - isinstance(item, str) for item in accepted_values - ): - raise ValueError( - "accepted_values should be a list of strings. " - f" Got {accepted_values} instead." - ) - if unseen not in accepted_values: -> raise ValueError( - f"Parameter `unseen` takes only values {', '.join(accepted_values)}." - f" Got {unseen} instead." - ) -E ValueError: Parameter `unseen` takes only values ignore, raise, encode, warn. Got bad_value instead. - -feature_engine\encoding\_helper_functions.py:10: ValueError - -During handling of the above exception, another exception occurred: - - def test_unseen_invalid_value_raises(): - """Invalid unseen value should raise ValueError at init.""" -> with pytest.raises(ValueError, match="unseen takes only values"): - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -E AssertionError: Regex pattern did not match. -E Expected regex: 'unseen takes only values' -E Actual message: 'Parameter `unseen` takes only values ignore, raise, encode, warn. Got bad_value instead.' - -tests\test_encoding\test_count_frequency_encoder.py:536: AssertionError -=========================== short test summary info =========================== -FAILED tests/test_encoding/test_count_frequency_encoder.py::test_unseen_invalid_value_raises -============================== 1 failed in 0.28s ==============================