Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/whats_new/v_190.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ New transformers
Enhancements
~~~~~~~~~~~~

- Added `errors` parameter to `CategoricalImputer`, so that categorical variables with multiple frequent categories can be imputed with a warning (`errors='warn'`) or silently (`errors='ignore'`), instead of automatically raising a `ValueError`. (`DirekKakkar <https://github.com/DirekKakkar>`_)
- Added ``unseen='warn'`` option to `CountFrequencyEncoder`: unseen categories are encoded as ``NaN`` and a ``UserWarning`` is raised listing the unseen categories per variable. (`DirekKakkar <https://github.com/DirekKakkar>`_)
- Our variable handling functions now return empty lists when no variables of the desired type are found. (`Soledad Galli <https://github.com/solegalli>`_)

BUG
Expand Down
44 changes: 26 additions & 18 deletions feature_engine/encoding/base_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,21 @@
from sklearn.utils.validation import check_is_fitted

from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin
from feature_engine._check_init_parameters.check_variables import (
_check_variables_input_value,
)
from feature_engine._check_init_parameters.check_variables import \
_check_variables_input_value
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_missing_values_docstring,
_variables_categorical_docstring,
)
from feature_engine._docstrings.init_parameters.encoders import _ignore_format_docstring
_missing_values_docstring, _variables_categorical_docstring)
from feature_engine._docstrings.init_parameters.encoders import \
_ignore_format_docstring
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import (
_check_optional_contains_na,
_check_X_matches_training_df,
check_X,
)
from feature_engine.dataframe_checks import (_check_optional_contains_na,
_check_X_matches_training_df,
check_X)
from feature_engine.tags import _return_tags
from feature_engine.variable_handling import (
check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables,
)
from feature_engine.variable_handling import (check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables)


@Substitution(
Expand Down Expand Up @@ -221,6 +215,18 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
def _encode(self, X: pd.DataFrame) -> pd.DataFrame:
# replace categories by the learned parameters
for feature in self.encoder_dict_.keys():
# Detect unseen categories BEFORE mapping so we can name them
if self.unseen == "warn":
unseen_cats = set(X[feature].dropna().unique()) - set(
self.encoder_dict_[feature].keys()
)
if unseen_cats:
warnings.warn(
f"Variable {feature!r} contains unseen categories: "
f"{unseen_cats}. These will be encoded as NaN.",
UserWarning,
)

X[feature] = X[feature].map(self.encoder_dict_[feature])

# if original variables are cast as categorical, they will remain
Expand Down Expand Up @@ -266,6 +272,8 @@ def _check_nan_values_after_transformation(self, X):
"During the encoding, NaN values were introduced in the feature(s) "
f"{nan_columns_str}."
)
# 'warn': per-variable warnings were already issued in _encode before
# the mapping, so nothing more to do here.

def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""Convert the encoded variable back to the original values.
Expand Down
32 changes: 12 additions & 20 deletions feature_engine/encoding/count_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,26 @@
import pandas as pd

from feature_engine._docstrings.fit_attributes import (
_feature_names_in_docstring,
_n_features_in_docstring,
_variables_attribute_docstring,
)
_feature_names_in_docstring, _n_features_in_docstring,
_variables_attribute_docstring)
from feature_engine._docstrings.init_parameters.all_trasnformers import (
_missing_values_docstring,
_variables_categorical_docstring,
)
_missing_values_docstring, _variables_categorical_docstring)
from feature_engine._docstrings.init_parameters.encoders import (
_ignore_format_docstring,
_unseen_docstring,
)
from feature_engine._docstrings.methods import (
_fit_transform_docstring,
_inverse_transform_docstring,
_transform_encoders_docstring,
)
_ignore_format_docstring, _unseen_docstring)
from feature_engine._docstrings.methods import (_fit_transform_docstring,
_inverse_transform_docstring,
_transform_encoders_docstring)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X
from feature_engine.encoding._helper_functions import check_parameter_unseen
from feature_engine.encoding.base_encoder import (
CategoricalInitMixinNA,
CategoricalMethodsMixin,
)
from feature_engine.encoding.base_encoder import (CategoricalInitMixinNA,
CategoricalMethodsMixin)

_unseen_docstring = (
_unseen_docstring
+ """ If `'encode'`, unseen categories will be encoded as 0 (zero)."""
+ """ If `'warn'`, unseen categories will be encoded as NaN and a"""
+ """ UserWarning is raised listing the unseen categories per variable."""
)


Expand Down Expand Up @@ -166,7 +158,7 @@ def __init__(
f"Got {encoding_method} instead."
)

check_parameter_unseen(unseen, ["ignore", "raise", "encode"])
check_parameter_unseen(unseen, ["ignore", "raise", "encode", "warn"])
super().__init__(variables, missing_values, ignore_format)
self.encoding_method = encoding_method
self.unseen = unseen
Expand Down
86 changes: 61 additions & 25 deletions feature_engine/imputation/categorical.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,26 @@
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

import warnings
from typing import List, Optional, Union

import pandas as pd

from feature_engine._check_init_parameters.check_variables import (
_check_variables_input_value,
)
from feature_engine._check_init_parameters.check_variables import \
_check_variables_input_value
from feature_engine._docstrings.fit_attributes import (
_feature_names_in_docstring,
_imputer_dict_docstring,
_n_features_in_docstring,
_variables_attribute_docstring,
)
from feature_engine._docstrings.methods import (
_fit_transform_docstring,
_transform_imputers_docstring,
)
_feature_names_in_docstring, _imputer_dict_docstring,
_n_features_in_docstring, _variables_attribute_docstring)
from feature_engine._docstrings.methods import (_fit_transform_docstring,
_transform_imputers_docstring)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X
from feature_engine.imputation.base_imputer import BaseImputer
from feature_engine.tags import _return_tags
from feature_engine.variable_handling import (
check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables,
)
from feature_engine.variable_handling import (check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables)


@Substitution(
Expand Down Expand Up @@ -88,6 +81,18 @@ class CategoricalImputer(BaseImputer):
type object or categorical. If True, the imputer will select all variables or
accept all variables entered by the user, including those cast as numeric.

errors : str, default='raise'
Indicates what to do when imputation_method='frequent' is selected and a
variable has more than one mode.

If 'raise', raises a ValueError and stops the fit.

If 'warn', issues a UserWarning and continues, imputing with the
first most frequent category found.

If 'ignore', continues without warnings, imputing using the first
most frequent category found.

Attributes
----------
{imputer_dict_}
Expand Down Expand Up @@ -135,6 +140,7 @@ def __init__(
variables: Union[None, int, str, List[Union[str, int]]] = None,
return_object: bool = False,
ignore_format: bool = False,
errors: str = "raise",
) -> None:
if imputation_method not in ["missing", "frequent"]:
raise ValueError(
Expand All @@ -144,11 +150,18 @@ def __init__(
if not isinstance(ignore_format, bool):
raise ValueError("ignore_format takes only booleans True and False")

if errors not in ("raise", "warn", "ignore"):
raise ValueError(
"errors takes only values 'raise', 'warn', or 'ignore'. "
f"Got {errors} instead."
)

self.imputation_method = imputation_method
self.fill_value = fill_value
self.variables = _check_variables_input_value(variables)
self.return_object = return_object
self.ignore_format = ignore_format
self.errors = errors

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Expand Down Expand Up @@ -189,9 +202,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):

# Some variables may contain more than 1 mode:
if len(mode_vals) > 1:
raise ValueError(
f"The variable {var} contains multiple frequent categories."
)
if self.errors == "raise":
raise ValueError(
f"The variable {var} contains multiple "
f"frequent categories. Set errors='warn' or "
f"errors='ignore' to allow imputation using "
f"the first most frequent category found."
)
elif self.errors == "warn":
warnings.warn(
f"Variable {var} has multiple frequent "
f"categories. The first category found, "
f"{mode_vals[0]}, will be used for imputation.",
UserWarning,
)

self.imputer_dict_ = {var: mode_vals[0]}

Expand All @@ -208,10 +232,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
varnames_str = ", ".join(varnames)
else:
varnames_str = varnames[0]
raise ValueError(
f"The variable(s) {varnames_str} contain(s) multiple frequent "
f"categories."
)

if self.errors == "raise":
raise ValueError(
f"The variable(s) {varnames_str} contain(s) "
f"multiple frequent categories. Set "
f"errors='warn' or errors='ignore' to allow "
f"imputation using the first most frequent "
f"category found."
)
elif self.errors == "warn":
warnings.warn(
f"Variable(s) {varnames_str} have multiple "
f"frequent categories. The first category "
f"found will be used for imputation.",
UserWarning,
)

self.imputer_dict_ = mode_vals.iloc[0].to_dict()

Expand Down
18 changes: 12 additions & 6 deletions feature_engine/imputation/drop_missing_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,14 @@ class DropMissingData(BaseImputer, TransformXyMixin):
will check missing data in all variables in the dataframe. Alternatively, the
imputer will evaluate missing data only in the variables in the list.

Note that if `missing_only=True`, missing data will be removed from variables
that had missing data in the train set. These might be a subset of the
variables indicated in the list.
If a list of variables is provided, ``missing_only`` must be set to ``False``.

missing_only: bool, default=True
If `True`, rows will be dropped when they show missing data in variables that
had missing data during `fit()`. If `False`, rows will be dropped if there is
missing data in any of the variables. This parameter only works when
`threshold=None`, otherwise it is ignored.
had missing data during `fit()`. Only valid when ``variables=None``. If
`False`, rows will be dropped if there is missing data in any of the
variables. This parameter only works when `threshold=None`, otherwise it is
ignored.

threshold: int or float, default=None
Require that percentage of non-NA values in a row to keep it. If
Expand Down Expand Up @@ -131,6 +130,13 @@ def __init__(
self.missing_only = missing_only
self.threshold = threshold

if self.variables is not None and missing_only is True:
raise ValueError(
"variables and missing_only cannot be used together. "
"Set variables=None to use missing_only=True, or set "
"missing_only=False to pass a list of variables."
)

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Find the variables for which missing data should be evaluated to decide if a
Expand Down
16 changes: 12 additions & 4 deletions feature_engine/imputation/missing_indicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,15 @@ class AddMissingIndicator(BaseImputer):
data or to all variables.

**True**: indicators will be created only for those variables that showed
missing data during `fit()`.
missing data during `fit()`. Only valid when ``variables=None``.

**False**: indicators will be created for all variables
**False**: indicators will be created for all variables passed in
``variables``, or all variables in the dataset if ``variables=None``.

variables: list, default=None
The list of variables to impute. If None, the imputer will find and
select all variables.

select all variables. If a list of variables is provided,
``missing_only`` must be set to ``False``.

Attributes
----------
Expand Down Expand Up @@ -111,6 +112,13 @@ def __init__(
self.variables = _check_variables_input_value(variables)
self.missing_only = missing_only

if self.variables is not None and missing_only is True:
raise ValueError(
"variables and missing_only cannot be used together. "
"Set variables=None to use missing_only=True, or set "
"missing_only=False to pass a list of variables."
)

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Learn the variables for which the missing indicators will be created.
Expand Down
Loading