Test and doc for n_features_in_ for sklearn.calibration #19555

Merged 15 commits on Mar 30, 2021
48 changes: 26 additions & 22 deletions sklearn/calibration.py
@@ -9,7 +9,6 @@
 import warnings
 from inspect import signature
-from contextlib import suppress
 from functools import partial

 from math import log
@@ -33,7 +32,7 @@
 from .utils.fixes import delayed
 from .utils.validation import check_is_fitted, check_consistent_length
 from .utils.validation import _check_sample_weight, _num_samples
-from .pipeline import Pipeline
+from .utils import _safe_indexing
 from .isotonic import IsotonicRegression
 from .svm import LinearSVC
 from .model_selection import check_cv, cross_val_predict
@@ -141,6 +140,12 @@ class CalibratedClassifierCV(ClassifierMixin,
     classes_ : ndarray of shape (n_classes,)
         The class labels.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`. Only defined if the
+        underlying base_estimator exposes such an attribute when fit.
+
+        .. versionadded:: 0.24

Member:

Suggested change:

-        .. versionadded:: 0.24
+        .. versionadded:: 1.0

Member:

n_features_in_ was set in 0.24 (but we did not document it).

@ogrisel (Member, Author), Mar 23, 2021:

Since it was there, I think it's more accurate to document that it was introduced in 0.24, even if it was not documented at the time.

This can be considered a fix for bad documentation, if you wish.
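
As a side note, a minimal sketch of the documented behavior (assuming a scikit-learn build that includes this change): n_features_in_ is only exposed on the meta-estimator when the fitted base_estimator itself exposes it.

```python
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, n_features=4, random_state=0)
base = LinearSVC().fit(X, y)

# The prefit path delegates n_features_in_ to the fitted base estimator.
calib = CalibratedClassifierCV(base, cv="prefit").fit(X, y)
assert calib.n_features_in_ == base.n_features_in_ == 4
```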


     calibrated_classifiers_ : list (len() equal to cv or 1 if `cv="prefit"` \
             or `ensemble=False`)
         The list of classifier and calibrator pairs.
@@ -250,14 +255,8 @@ def fit(self, X, y, sample_weight=None):
         self.calibrated_classifiers_ = []
         if self.cv == "prefit":
-            # `classes_` and `n_features_in_` should be consistent with that
-            # of base_estimator
-            if isinstance(self.base_estimator, Pipeline):
-                check_is_fitted(self.base_estimator[-1])
-            else:
-                check_is_fitted(self.base_estimator)
-            with suppress(AttributeError):
-                self.n_features_in_ = base_estimator.n_features_in_
+            # `classes_` should be consistent with that of base_estimator
+            check_is_fitted(self.base_estimator, attributes=["classes_"])
             self.classes_ = self.base_estimator.classes_

             pred_method = _get_prediction_method(base_estimator)
@@ -270,10 +269,6 @@
             )
             self.calibrated_classifiers_.append(calibrated_classifier)
         else:
-            X, y = self._validate_data(
-                X, y, accept_sparse=['csc', 'csr', 'coo'],
-                force_all_finite=False, allow_nd=True
-            )

@ogrisel (Member, Author), Mar 17, 2021:

For consistency, I prefer to never validate the data in the meta-estimator and to use _safe_indexing in _fit_classifier_calibrator_pair instead (see the sketch below). I am not sure whether this is right or not.
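
A rough sketch of what _safe_indexing buys here (it is a private scikit-learn utility, so this is illustrative only): it selects rows consistently across container types that plain `X[train]` indexing cannot handle, such as Python lists.

```python
from sklearn.utils import _safe_indexing

# A list of dicts, e.g. the input to a DictVectorizer step.
X = [{'state': 'NY'}, {'state': 'TX'}, {'state': 'VT'}]
train = [0, 2]

# X[train] would raise a TypeError on a plain list; _safe_indexing
# dispatches on the container type and returns the selected rows.
X_train = _safe_indexing(X, train)
assert X_train == [{'state': 'NY'}, {'state': 'VT'}]
```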

             # Set `classes_` using all `y`
             label_encoder_ = LabelEncoder().fit(y)
             self.classes_ = label_encoder_.classes_
@@ -334,6 +329,9 @@ def fit(self, X, y, sample_weight=None):
                 )
                 self.calibrated_classifiers_.append(calibrated_classifier)

+        first_clf = self.calibrated_classifiers_[0].base_estimator
+        if hasattr(first_clf, "n_features_in_"):
+            self.n_features_in_ = first_clf.n_features_in_
         return self

     def predict_proba(self, X):
@@ -352,7 +350,6 @@
             The predicted probas.
         """
         check_is_fitted(self)
-
         # Compute the arithmetic mean of the predictions of the calibrated
         # classifiers
         mean_proba = np.zeros((_num_samples(X), len(self.classes_)))
@@ -431,19 +428,26 @@ def _fit_classifier_calibrator_pair(estimator, X, y, train, test, supports_sw,
     -------
     calibrated_classifier : _CalibratedClassifier instance
     """
-    if sample_weight is not None and supports_sw:
-        estimator.fit(X[train], y[train],
-                      sample_weight=sample_weight[train])
+    X_train, y_train = _safe_indexing(X, train), _safe_indexing(y, train)
+    X_test, y_test = _safe_indexing(X, test), _safe_indexing(y, test)
+    if supports_sw and sample_weight is not None:
+        sw_train = _safe_indexing(sample_weight, train)
+        sw_test = _safe_indexing(sample_weight, test)
+    else:
+        sw_train = None
+        sw_test = None
+
+    if supports_sw:
+        estimator.fit(X_train, y_train, sample_weight=sw_train)
     else:
-        estimator.fit(X[train], y[train])
+        estimator.fit(X_train, y_train)

     n_classes = len(classes)
     pred_method = _get_prediction_method(estimator)
-    predictions = _compute_predictions(pred_method, X[test], n_classes)
+    predictions = _compute_predictions(pred_method, X_test, n_classes)

-    sw = None if sample_weight is None else sample_weight[test]
     calibrated_classifier = _fit_calibrator(
-        estimator, predictions, y[test], classes, method, sample_weight=sw
+        estimator, predictions, y_test, classes, method, sample_weight=sw_test
     )
     return calibrated_classifier

37 changes: 26 additions & 11 deletions sklearn/tests/test_calibration.py
@@ -512,27 +512,27 @@ def decision_function(self, X):


 @pytest.fixture
-def text_data():
-    text_data = [
+def dict_data():
+    dict_data = [
         {'state': 'NY', 'age': 'adult'},
         {'state': 'TX', 'age': 'adult'},
         {'state': 'VT', 'age': 'child'},
     ]
     text_labels = [1, 0, 1]
-    return text_data, text_labels
+    return dict_data, text_labels


 @pytest.fixture
-def text_data_pipeline(text_data):
-    X, y = text_data
+def dict_data_pipeline(dict_data):
+    X, y = dict_data
     pipeline_prefit = Pipeline([
         ('vectorizer', DictVectorizer()),
         ('clf', RandomForestClassifier())
     ])
     return pipeline_prefit.fit(X, y)


-def test_calibration_pipeline(text_data, text_data_pipeline):
+def test_calibration_dict_pipeline(dict_data, dict_data_pipeline):
     """Test that calibration works in prefit pipeline with transformer

     `X` is not array-like, sparse matrix or dataframe at the start.
@@ -541,15 +541,17 @@ def test_calibration_pipeline(text_data, text_data_pipeline):
     Also test it can predict without running into validation errors.
     See https://github.com/scikit-learn/scikit-learn/issues/19637
     """
-    X, y = text_data
-    clf = text_data_pipeline
+    X, y = dict_data
+    clf = dict_data_pipeline

@ogrisel (Member, Author):

@thomasjpfan note that while working on this, I discovered that _num_features(X) == 2 here, while DictVectorizer will probably never set n_features_in_, because it is flexible enough to accept a variable number of dict entries.

This is not a problem for this PR, because I made CalibratedClassifierCV check _num_features(X) only if base_estimator defines the n_features_in_ attribute, but I wonder whether it reveals that our _num_features utility is trying to be too smart.

Member:

What do you suggest to be the alternative?

Member:

I opened #19740 to have _num_features error for a collection of dicts.
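
A hedged illustration of the point above (behavior at the time of this discussion; #19740 later changed how _num_features treats collections of dicts):

```python
from sklearn.feature_extraction import DictVectorizer

# DictVectorizer accepts a variable number of entries per dict, so it
# never sets n_features_in_ after fitting.
X = [{'state': 'NY', 'age': 'adult'}, {'state': 'TX'}]
vec = DictVectorizer().fit(X)
assert not hasattr(vec, 'n_features_in_')

# At the time of this thread, _num_features(X) counted the keys of the
# first dict (2 here); after #19740 it raises a TypeError instead.
```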

     calib_clf = CalibratedClassifierCV(clf, cv='prefit')
     calib_clf.fit(X, y)
     # Check attributes are obtained from fitted estimator
     assert_array_equal(calib_clf.classes_, clf.classes_)
-    msg = "'CalibratedClassifierCV' object has no attribute"
-    with pytest.raises(AttributeError, match=msg):
-        calib_clf.n_features_in_
+
+    # Neither the pipeline nor the calibration meta-estimator
+    # expose the n_features_in_ check on this kind of data.
+    assert not hasattr(clf, 'n_features_in_')
+    assert not hasattr(calib_clf, 'n_features_in_')

     # Ensure that no error is thrown with predict and predict_proba
     calib_clf.predict(X)
@@ -578,6 +580,19 @@ def test_calibration_attributes(clf, cv):
     assert calib_clf.n_features_in_ == X.shape[1]


+def test_calibration_inconsistent_prefit_n_features_in():
+    # Check that `n_features_in_` from prefit base estimator
+    # is consistent with training set
+    X, y = make_classification(n_samples=10, n_features=5,
+                               n_classes=2, random_state=7)
+    clf = LinearSVC(C=1).fit(X, y)
+    calib_clf = CalibratedClassifierCV(clf, cv='prefit')
+
+    msg = "X has 3 features, but LinearSVC is expecting 5 features as input."
+    with pytest.raises(ValueError, match=msg):
+        calib_clf.fit(X[:, :3], y)
+
+
 # FIXME: remove in 1.1
 def test_calibrated_classifier_cv_deprecation(data):
     # Check that we raise the proper deprecation warning if accessing
1 change: 0 additions & 1 deletion sklearn/tests/test_common.py
@@ -261,7 +261,6 @@ def test_search_cv(estimator, check, request):
 #
 # check_classifiers_train would need to be updated with the error message
 N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = {
-    'calibration',
     'compose',
     'feature_extraction',
     'mixture',
1 change: 0 additions & 1 deletion sklearn/tests/test_docstring_parameters.py
@@ -176,7 +176,6 @@ def _construct_searchcv_instance(SearchCV):


 N_FEATURES_MODULES_TO_IGNORE = {
-    'calibration',
     'cluster',
     'compose',
     'covariance',
7 changes: 5 additions & 2 deletions sklearn/utils/estimator_checks.py
@@ -959,10 +959,13 @@ def check_dtype_object(name, estimator_orig):


 def check_complex_data(name, estimator_orig):
+    rng = np.random.RandomState(42)

Member:

Is this change related to this PR? (I'm happy to keep it here anyway.)

@ogrisel (Member, Author), Mar 23, 2021:

Yes, otherwise this check would fail in this PR: validation is now delegated to the underlying estimator, after the CV split of CalibratedClassifierCV. The check was failing because the default StratifiedKFold split choked on the unique values in y, which prevented having balanced "classes" in the validation folds.
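
A small sketch of that failure mode, using integer labels for simplicity (the mechanism is the same for the old complex-valued y): when every sample carries a unique label, StratifiedKFold cannot place each "class" in every fold, so the split fails before the complex-data check is ever reached.

```python
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.zeros((10, 1))
y_unique = np.arange(10)  # every value is its own "class"

skf = StratifiedKFold(n_splits=5)
try:
    list(skf.split(X, y_unique))  # the generator raises on iteration
except ValueError as exc:
    # "n_splits=5 cannot be greater than the number of members in each class."
    print(exc)
```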

     # check that estimators raise an exception on providing complex data
-    X = np.random.sample(10) + 1j * np.random.sample(10)
+    X = rng.uniform(size=10) + 1j * rng.uniform(size=10)
     X = X.reshape(-1, 1)
-    y = np.random.sample(10) + 1j * np.random.sample(10)
+
+    # Something both valid for classification and regression
+    y = rng.randint(low=0, high=2, size=10) + 1j
     estimator = clone(estimator_orig)
     with raises(ValueError, match="Complex data not supported"):
         estimator.fit(X, y)