Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/whats_new/v_190.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ New transformers
Enhancements
~~~~~~~~~~~~

- Added `errors` parameter to `CategoricalImputer`, so that categorical variables with multiple frequent categories can be imputed with a warning (`errors='warn'`) or silently (`errors='ignore'`), instead of automatically raising a `ValueError`. (`DirekKakkar <https://github.com/DirekKakkar>`_)
- Added ``unseen='warn'`` option to `CountFrequencyEncoder`: unseen categories are encoded as ``NaN`` and a ``UserWarning`` is raised listing the unseen categories per variable. (`DirekKakkar <https://github.com/DirekKakkar>`_)
- Our variable handling functions now return empty lists when no variables of the desired type are found. (`Soledad Galli <https://github.com/solegalli>`_)

BUG
Expand Down
44 changes: 26 additions & 18 deletions feature_engine/encoding/base_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,21 @@
from sklearn.utils.validation import check_is_fitted

from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin
from feature_engine._check_init_parameters.check_variables import (
_check_variables_input_value,
)
from feature_engine._check_init_parameters.check_variables import \
_check_variables_input_value
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_missing_values_docstring,
_variables_categorical_docstring,
)
from feature_engine._docstrings.init_parameters.encoders import _ignore_format_docstring
_missing_values_docstring, _variables_categorical_docstring)
from feature_engine._docstrings.init_parameters.encoders import \
_ignore_format_docstring
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import (
_check_optional_contains_na,
_check_X_matches_training_df,
check_X,
)
from feature_engine.dataframe_checks import (_check_optional_contains_na,
_check_X_matches_training_df,
check_X)
from feature_engine.tags import _return_tags
from feature_engine.variable_handling import (
check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables,
)
from feature_engine.variable_handling import (check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables)


@Substitution(
Expand Down Expand Up @@ -221,6 +215,18 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
def _encode(self, X: pd.DataFrame) -> pd.DataFrame:
# replace categories by the learned parameters
for feature in self.encoder_dict_.keys():
# Detect unseen categories BEFORE mapping so we can name them
if self.unseen == "warn":
unseen_cats = set(X[feature].dropna().unique()) - set(
self.encoder_dict_[feature].keys()
)
if unseen_cats:
warnings.warn(
f"Variable {feature!r} contains unseen categories: "
f"{unseen_cats}. These will be encoded as NaN.",
UserWarning,
)

X[feature] = X[feature].map(self.encoder_dict_[feature])

# if original variables are cast as categorical, they will remain
Expand Down Expand Up @@ -266,6 +272,8 @@ def _check_nan_values_after_transformation(self, X):
"During the encoding, NaN values were introduced in the feature(s) "
f"{nan_columns_str}."
)
# 'warn': per-variable warnings were already issued in _encode before
# the mapping, so nothing more to do here.

def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""Convert the encoded variable back to the original values.
Expand Down
32 changes: 12 additions & 20 deletions feature_engine/encoding/count_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,26 @@
import pandas as pd

from feature_engine._docstrings.fit_attributes import (
_feature_names_in_docstring,
_n_features_in_docstring,
_variables_attribute_docstring,
)
_feature_names_in_docstring, _n_features_in_docstring,
_variables_attribute_docstring)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_missing_values_docstring,
_variables_categorical_docstring,
)
_missing_values_docstring, _variables_categorical_docstring)
from feature_engine._docstrings.init_parameters.encoders import (
_ignore_format_docstring,
_unseen_docstring,
)
from feature_engine._docstrings.methods import (
_fit_transform_docstring,
_inverse_transform_docstring,
_transform_encoders_docstring,
)
_ignore_format_docstring, _unseen_docstring)
from feature_engine._docstrings.methods import (_fit_transform_docstring,
_inverse_transform_docstring,
_transform_encoders_docstring)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X
from feature_engine.encoding._helper_functions import check_parameter_unseen
from feature_engine.encoding.base_encoder import (
CategoricalInitMixinNA,
CategoricalMethodsMixin,
)
from feature_engine.encoding.base_encoder import (CategoricalInitMixinNA,
CategoricalMethodsMixin)

_unseen_docstring = (
_unseen_docstring
+ """ If `'encode'`, unseen categories will be encoded as 0 (zero)."""
+ """ If `'warn'`, unseen categories will be encoded as NaN and a"""
+ """ UserWarning is raised listing the unseen categories per variable."""
)


Expand Down Expand Up @@ -166,7 +158,7 @@ def __init__(
f"Got {encoding_method} instead."
)

check_parameter_unseen(unseen, ["ignore", "raise", "encode"])
check_parameter_unseen(unseen, ["ignore", "raise", "encode", "warn"])
super().__init__(variables, missing_values, ignore_format)
self.encoding_method = encoding_method
self.unseen = unseen
Expand Down
86 changes: 61 additions & 25 deletions feature_engine/imputation/categorical.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,26 @@
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

import warnings
from typing import List, Optional, Union

import pandas as pd

from feature_engine._check_init_parameters.check_variables import (
_check_variables_input_value,
)
from feature_engine._check_init_parameters.check_variables import \
_check_variables_input_value
from feature_engine._docstrings.fit_attributes import (
_feature_names_in_docstring,
_imputer_dict_docstring,
_n_features_in_docstring,
_variables_attribute_docstring,
)
from feature_engine._docstrings.methods import (
_fit_transform_docstring,
_transform_imputers_docstring,
)
_feature_names_in_docstring, _imputer_dict_docstring,
_n_features_in_docstring, _variables_attribute_docstring)
from feature_engine._docstrings.methods import (_fit_transform_docstring,
_transform_imputers_docstring)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X
from feature_engine.imputation.base_imputer import BaseImputer
from feature_engine.tags import _return_tags
from feature_engine.variable_handling import (
check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables,
)
from feature_engine.variable_handling import (check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables)


@Substitution(
Expand Down Expand Up @@ -88,6 +81,18 @@ class CategoricalImputer(BaseImputer):
type object or categorical. If True, the imputer will select all variables or
accept all variables entered by the user, including those cast as numeric.

errors : str, default='raise'
Indicates what to do when imputation_method='frequent' is selected and a
variable has more than one mode.

If 'raise', raises a ValueError and stops the fit.

If 'warn', issues a UserWarning and continues, imputing with the
first most frequent category found.

If 'ignore', continues without warnings, imputing using the first
most frequent category found.

Attributes
----------
{imputer_dict_}
Expand Down Expand Up @@ -135,6 +140,7 @@ def __init__(
variables: Union[None, int, str, List[Union[str, int]]] = None,
return_object: bool = False,
ignore_format: bool = False,
errors: str = "raise",
) -> None:
if imputation_method not in ["missing", "frequent"]:
raise ValueError(
Expand All @@ -144,11 +150,18 @@ def __init__(
if not isinstance(ignore_format, bool):
raise ValueError("ignore_format takes only booleans True and False")

if errors not in ("raise", "warn", "ignore"):
raise ValueError(
"errors takes only values 'raise', 'warn', or 'ignore'. "
f"Got {errors} instead."
)

self.imputation_method = imputation_method
self.fill_value = fill_value
self.variables = _check_variables_input_value(variables)
self.return_object = return_object
self.ignore_format = ignore_format
self.errors = errors

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Expand Down Expand Up @@ -189,9 +202,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):

# Some variables may contain more than 1 mode:
if len(mode_vals) > 1:
raise ValueError(
f"The variable {var} contains multiple frequent categories."
)
if self.errors == "raise":
raise ValueError(
f"The variable {var} contains multiple "
f"frequent categories. Set errors='warn' or "
f"errors='ignore' to allow imputation using "
f"the first most frequent category found."
)
elif self.errors == "warn":
warnings.warn(
f"Variable {var} has multiple frequent "
f"categories. The first category found, "
f"{mode_vals[0]}, will be used for imputation.",
UserWarning,
)

self.imputer_dict_ = {var: mode_vals[0]}

Expand All @@ -208,10 +232,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
varnames_str = ", ".join(varnames)
else:
varnames_str = varnames[0]
raise ValueError(
f"The variable(s) {varnames_str} contain(s) multiple frequent "
f"categories."
)

if self.errors == "raise":
raise ValueError(
f"The variable(s) {varnames_str} contain(s) "
f"multiple frequent categories. Set "
f"errors='warn' or errors='ignore' to allow "
f"imputation using the first most frequent "
f"category found."
)
elif self.errors == "warn":
warnings.warn(
f"Variable(s) {varnames_str} have multiple "
f"frequent categories. The first category "
f"found will be used for imputation.",
UserWarning,
)

self.imputer_dict_ = mode_vals.iloc[0].to_dict()

Expand Down
18 changes: 12 additions & 6 deletions feature_engine/imputation/drop_missing_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,14 @@ class DropMissingData(BaseImputer, TransformXyMixin):
will check missing data in all variables in the dataframe. Alternatively, the
imputer will evaluate missing data only in the variables in the list.

Note that if `missing_only=True`, missing data will be removed from variables
that had missing data in the train set. These might be a subset of the
variables indicated in the list.
If a list of variables is provided, ``missing_only`` must be set to ``False``.

missing_only: bool, default=True
If `True`, rows will be dropped when they show missing data in variables that
had missing data during `fit()`. If `False`, rows will be dropped if there is
missing data in any of the variables. This parameter only works when
`threshold=None`, otherwise it is ignored.
had missing data during `fit()`. Only valid when ``variables=None``. If
`False`, rows will be dropped if there is missing data in any of the
variables. This parameter only works when `threshold=None`, otherwise it is
ignored.

threshold: int or float, default=None
Require that percentage of non-NA values in a row to keep it. If
Expand Down Expand Up @@ -131,6 +130,13 @@ def __init__(
self.missing_only = missing_only
self.threshold = threshold

if self.variables is not None and missing_only is True:
raise ValueError(
"variables and missing_only cannot be used together. "
"Set variables=None to use missing_only=True, or set "
"missing_only=False to pass a list of variables."
)

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Find the variables for which missing data should be evaluated to decide if a
Expand Down
16 changes: 12 additions & 4 deletions feature_engine/imputation/missing_indicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,15 @@ class AddMissingIndicator(BaseImputer):
data or to all variables.

**True**: indicators will be created only for those variables that showed
missing data during `fit()`.
missing data during `fit()`. Only valid when ``variables=None``.

**False**: indicators will be created for all variables
**False**: indicators will be created for all variables passed in
``variables``, or all variables in the dataset if ``variables=None``.

variables: list, default=None
The list of variables to impute. If None, the imputer will find and
select all variables.

select all variables. If a list of variables is provided,
``missing_only`` must be set to ``False``.

Attributes
----------
Expand Down Expand Up @@ -111,6 +112,13 @@ def __init__(
self.variables = _check_variables_input_value(variables)
self.missing_only = missing_only

if self.variables is not None and missing_only is True:
raise ValueError(
"variables and missing_only cannot be used together. "
"Set variables=None to use missing_only=True, or set "
"missing_only=False to pass a list of variables."
)

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Learn the variables for which the missing indicators will be created.
Expand Down
Loading