Test and doc for n_features_in_ for sklearn.calibration #19555

Merged 15 commits on Mar 30, 2021
48 changes: 26 additions & 22 deletions sklearn/calibration.py
@@ -9,7 +9,6 @@
 import warnings
 from inspect import signature
-from contextlib import suppress
 from functools import partial

 from math import log
@@ -33,7 +32,7 @@
 from .utils.fixes import delayed
 from .utils.validation import check_is_fitted, check_consistent_length
 from .utils.validation import _check_sample_weight, _num_samples
-from .pipeline import Pipeline
+from .utils import _safe_indexing
 from .isotonic import IsotonicRegression
 from .svm import LinearSVC
 from .model_selection import check_cv, cross_val_predict
@@ -141,6 +140,12 @@ class CalibratedClassifierCV(ClassifierMixin,
     classes_ : ndarray of shape (n_classes,)
         The class labels.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`. Only defined if the
+        underlying base_estimator exposes such an attribute when fit.
+
+        .. versionadded:: 0.24

Member:

Suggested change:

-        .. versionadded:: 0.24
+        .. versionadded:: 1.0

Member:

n_features_in_ was set in 0.24 (but we did not document it).

@ogrisel (Member, Author), Mar 23, 2021:

Since it was there, I think it's more accurate to document that it was introduced in 0.24, even if it was not documented at the time.

This can be considered a fix for bad documentation, if you wish.
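
As a side note, a minimal sketch of the documented behavior (assuming a scikit-learn build that includes this change): n_features_in_ is only exposed on the meta-estimator when the fitted base_estimator itself exposes it.

```python
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, n_features=4, random_state=0)
base = LinearSVC().fit(X, y)

# The prefit path delegates n_features_in_ to the fitted base estimator.
calib = CalibratedClassifierCV(base, cv="prefit").fit(X, y)
assert calib.n_features_in_ == base.n_features_in_ == 4
```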


     calibrated_classifiers_ : list (len() equal to cv or 1 if `cv="prefit"` \
             or `ensemble=False`)
         The list of classifier and calibrator pairs.
@@ -250,14 +255,8 @@ def fit(self, X, y, sample_weight=None):
         self.calibrated_classifiers_ = []
         if self.cv == "prefit":
-            # `classes_` and `n_features_in_` should be consistent with that
-            # of base_estimator
-            if isinstance(self.base_estimator, Pipeline):
-                check_is_fitted(self.base_estimator[-1])
-            else:
-                check_is_fitted(self.base_estimator)
-            with suppress(AttributeError):
-                self.n_features_in_ = base_estimator.n_features_in_
+            # `classes_` should be consistent with that of base_estimator
+            check_is_fitted(self.base_estimator, attributes=["classes_"])
             self.classes_ = self.base_estimator.classes_

             pred_method = _get_prediction_method(base_estimator)
@@ -270,10 +269,6 @@
             )
             self.calibrated_classifiers_.append(calibrated_classifier)
         else:
-            X, y = self._validate_data(
-                X, y, accept_sparse=['csc', 'csr', 'coo'],
-                force_all_finite=False, allow_nd=True
-            )

@ogrisel (Member, Author), Mar 17, 2021:

For consistency, I prefer to never validate the data in the meta-estimator and to use _safe_indexing in _fit_classifier_calibrator_pair instead (see the sketch below). I am not sure whether this is right or not.
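
A rough sketch of what _safe_indexing buys here (it is a private scikit-learn utility, so this is illustrative only): it selects rows consistently across container types that plain `X[train]` indexing cannot handle, such as Python lists.

```python
from sklearn.utils import _safe_indexing

# A list of dicts, e.g. the input to a DictVectorizer step.
X = [{'state': 'NY'}, {'state': 'TX'}, {'state': 'VT'}]
train = [0, 2]

# X[train] would raise a TypeError on a plain list; _safe_indexing
# dispatches on the container type and returns the selected rows.
X_train = _safe_indexing(X, train)
assert X_train == [{'state': 'NY'}, {'state': 'VT'}]
```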

             # Set `classes_` using all `y`
             label_encoder_ = LabelEncoder().fit(y)
             self.classes_ = label_encoder_.classes_
@@ -334,6 +329,9 @@ def fit(self, X, y, sample_weight=None):
                 )
                 self.calibrated_classifiers_.append(calibrated_classifier)

+        first_clf = self.calibrated_classifiers_[0].base_estimator
+        if hasattr(first_clf, "n_features_in_"):
+            self.n_features_in_ = first_clf.n_features_in_
         return self

     def predict_proba(self, X):
@@ -352,7 +350,6 @@
             The predicted probas.
         """
         check_is_fitted(self)
-
         # Compute the arithmetic mean of the predictions of the calibrated
         # classifiers
         mean_proba = np.zeros((_num_samples(X), len(self.classes_)))
@@ -431,19 +428,26 @@ def _fit_classifier_calibrator_pair(estimator, X, y, train, test, supports_sw,
     -------
     calibrated_classifier : _CalibratedClassifier instance
     """
-    if sample_weight is not None and supports_sw:
-        estimator.fit(X[train], y[train],
-                      sample_weight=sample_weight[train])
+    X_train, y_train = _safe_indexing(X, train), _safe_indexing(y, train)
+    X_test, y_test = _safe_indexing(X, test), _safe_indexing(y, test)
+    if supports_sw and sample_weight is not None:
+        sw_train = _safe_indexing(sample_weight, train)
+        sw_test = _safe_indexing(sample_weight, test)
+    else:
+        sw_train = None
+        sw_test = None
+
+    if supports_sw:
+        estimator.fit(X_train, y_train, sample_weight=sw_train)
     else:
-        estimator.fit(X[train], y[train])
+        estimator.fit(X_train, y_train)

     n_classes = len(classes)
     pred_method = _get_prediction_method(estimator)
-    predictions = _compute_predictions(pred_method, X[test], n_classes)
+    predictions = _compute_predictions(pred_method, X_test, n_classes)

-    sw = None if sample_weight is None else sample_weight[test]
     calibrated_classifier = _fit_calibrator(
-        estimator, predictions, y[test], classes, method, sample_weight=sw
+        estimator, predictions, y_test, classes, method, sample_weight=sw_test
     )
     return calibrated_classifier

37 changes: 26 additions & 11 deletions sklearn/tests/test_calibration.py
@@ -512,27 +512,27 @@ def decision_function(self, X):


 @pytest.fixture
-def text_data():
-    text_data = [
+def dict_data():
+    dict_data = [
         {'state': 'NY', 'age': 'adult'},
         {'state': 'TX', 'age': 'adult'},
         {'state': 'VT', 'age': 'child'},
     ]
     text_labels = [1, 0, 1]
-    return text_data, text_labels
+    return dict_data, text_labels


 @pytest.fixture
-def text_data_pipeline(text_data):
-    X, y = text_data
+def dict_data_pipeline(dict_data):
+    X, y = dict_data
     pipeline_prefit = Pipeline([
         ('vectorizer', DictVectorizer()),
         ('clf', RandomForestClassifier())
     ])
     return pipeline_prefit.fit(X, y)


-def test_calibration_pipeline(text_data, text_data_pipeline):
+def test_calibration_dict_pipeline(dict_data, dict_data_pipeline):
     """Test that calibration works in prefit pipeline with transformer

     `X` is not array-like, sparse matrix or dataframe at the start.
@@ -541,15 +541,17 @@ def test_calibration_pipeline(text_data, text_data_pipeline):
     Also test it can predict without running into validation errors.
     See https://github.com/scikit-learn/scikit-learn/issues/19637
     """
-    X, y = text_data
-    clf = text_data_pipeline
+    X, y = dict_data
+    clf = dict_data_pipeline

@ogrisel (Member, Author):

@thomasjpfan note that while working on this, I discovered that _num_features(X) == 2 here, while DictVectorizer will probably never set n_features_in_, because it is flexible enough to accept a variable number of dict entries.

This is not a problem for this PR, because I made CalibratedClassifierCV check _num_features(X) only if base_estimator defines the n_features_in_ attribute, but I wonder whether it reveals that our _num_features utility is trying to be too smart.

Member:

What do you suggest to be the alternative?

Member:

I opened #19740 to have _num_features error for a collection of dicts.
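
A hedged illustration of the point above (behavior at the time of this discussion; #19740 later changed how _num_features treats collections of dicts):

```python
from sklearn.feature_extraction import DictVectorizer

# DictVectorizer accepts a variable number of entries per dict, so it
# never sets n_features_in_ after fitting.
X = [{'state': 'NY', 'age': 'adult'}, {'state': 'TX'}]
vec = DictVectorizer().fit(X)
assert not hasattr(vec, 'n_features_in_')

# At the time of this thread, _num_features(X) counted the keys of the
# first dict (2 here); after #19740 it raises a TypeError instead.
```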

     calib_clf = CalibratedClassifierCV(clf, cv='prefit')
     calib_clf.fit(X, y)
     # Check attributes are obtained from fitted estimator
     assert_array_equal(calib_clf.classes_, clf.classes_)
-    msg = "'CalibratedClassifierCV' object has no attribute"
-    with pytest.raises(AttributeError, match=msg):
-        calib_clf.n_features_in_
+
+    # Neither the pipeline nor the calibration meta-estimator
+    # expose the n_features_in_ check on this kind of data.
+    assert not hasattr(clf, 'n_features_in_')
+    assert not hasattr(calib_clf, 'n_features_in_')

     # Ensure that no error is thrown with predict and predict_proba
     calib_clf.predict(X)
@@ -578,6 +580,19 @@ def test_calibration_attributes(clf, cv):
     assert calib_clf.n_features_in_ == X.shape[1]


+def test_calibration_inconsistent_prefit_n_features_in():
+    # Check that `n_features_in_` from prefit base estimator
+    # is consistent with training set
+    X, y = make_classification(n_samples=10, n_features=5,
+                               n_classes=2, random_state=7)
+    clf = LinearSVC(C=1).fit(X, y)
+    calib_clf = CalibratedClassifierCV(clf, cv='prefit')
+
+    msg = "X has 3 features, but LinearSVC is expecting 5 features as input."
+    with pytest.raises(ValueError, match=msg):
+        calib_clf.fit(X[:, :3], y)
+
+
 # FIXME: remove in 1.1
 def test_calibrated_classifier_cv_deprecation(data):
     # Check that we raise the proper deprecation warning if accessing
1 change: 0 additions & 1 deletion sklearn/tests/test_common.py
@@ -261,7 +261,6 @@ def test_search_cv(estimator, check, request):
 #
 # check_classifiers_train would need to be updated with the error message
 N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = {
-    'calibration',
     'compose',
     'feature_extraction',
     'mixture',
1 change: 0 additions & 1 deletion sklearn/tests/test_docstring_parameters.py
@@ -176,7 +176,6 @@ def _construct_searchcv_instance(SearchCV):


 N_FEATURES_MODULES_TO_IGNORE = {
-    'calibration',
     'cluster',
     'compose',
     'covariance',
7 changes: 5 additions & 2 deletions sklearn/utils/estimator_checks.py
@@ -959,10 +959,13 @@ def check_dtype_object(name, estimator_orig):


 def check_complex_data(name, estimator_orig):
+    rng = np.random.RandomState(42)

Member:

Is this change related to this PR? (I'm happy to keep it here anyway.)

@ogrisel (Member, Author), Mar 23, 2021:

Yes, otherwise this check would fail in this PR: validation is now delegated to the underlying estimator, after the CV split of CalibratedClassifierCV. The check was failing because the default StratifiedKFold split choked on the unique values in y, which prevented having balanced "classes" in the validation folds.
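
A small sketch of that failure mode, using integer labels for simplicity (the mechanism is the same for the old complex-valued y): when every sample carries a unique label, StratifiedKFold cannot place each "class" in every fold, so the split fails before the complex-data check is ever reached.

```python
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.zeros((10, 1))
y_unique = np.arange(10)  # every value is its own "class"

skf = StratifiedKFold(n_splits=5)
try:
    list(skf.split(X, y_unique))  # the generator raises on iteration
except ValueError as exc:
    # "n_splits=5 cannot be greater than the number of members in each class."
    print(exc)
```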

     # check that estimators raise an exception on providing complex data
-    X = np.random.sample(10) + 1j * np.random.sample(10)
+    X = rng.uniform(size=10) + 1j * rng.uniform(size=10)
     X = X.reshape(-1, 1)
-    y = np.random.sample(10) + 1j * np.random.sample(10)
+
+    # Something both valid for classification and regression
+    y = rng.randint(low=0, high=2, size=10) + 1j
     estimator = clone(estimator_orig)
     with raises(ValueError, match="Complex data not supported"):
         estimator.fit(X, y)