-
-
Notifications
You must be signed in to change notification settings - Fork 26.1k
Test and doc for n_features_in_ for sklearn.calibration #19555
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2f6de70
24d8e90
4a2ce3f
446aff1
d4f5118
8ccd12c
057a1f9
af0f3a9
1425db0
aab7e7f
cb1548e
c52d2b7
d85441a
e166e0e
1ff64be
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -9,7 +9,6 @@ | |||||
|
||||||
import warnings | ||||||
from inspect import signature | ||||||
from contextlib import suppress | ||||||
from functools import partial | ||||||
|
||||||
from math import log | ||||||
|
@@ -33,7 +32,7 @@ | |||||
from .utils.fixes import delayed | ||||||
from .utils.validation import check_is_fitted, check_consistent_length | ||||||
from .utils.validation import _check_sample_weight, _num_samples | ||||||
from .pipeline import Pipeline | ||||||
from .utils import _safe_indexing | ||||||
from .isotonic import IsotonicRegression | ||||||
from .svm import LinearSVC | ||||||
from .model_selection import check_cv, cross_val_predict | ||||||
|
@@ -141,6 +140,12 @@ class CalibratedClassifierCV(ClassifierMixin, | |||||
classes_ : ndarray of shape (n_classes,) | ||||||
The class labels. | ||||||
|
||||||
n_features_in_ : int | ||||||
Number of features seen during :term:`fit`. Only defined if the | ||||||
underlying base_estimator exposes such an attribute when fit. | ||||||
|
||||||
.. versionadded:: 0.24 | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since it was there, I think it's more accurate to document that it was introduced in 0.24, even if not documented. This can be considered a fix for bad documentation if you wish. |
||||||
|
||||||
calibrated_classifiers_ : list (len() equal to cv or 1 if `cv="prefit"` \ | ||||||
or `ensemble=False`) | ||||||
The list of classifier and calibrator pairs. | ||||||
|
@@ -250,14 +255,8 @@ def fit(self, X, y, sample_weight=None): | |||||
|
||||||
self.calibrated_classifiers_ = [] | ||||||
if self.cv == "prefit": | ||||||
# `classes_` and `n_features_in_` should be consistent with that | ||||||
# of base_estimator | ||||||
if isinstance(self.base_estimator, Pipeline): | ||||||
check_is_fitted(self.base_estimator[-1]) | ||||||
else: | ||||||
check_is_fitted(self.base_estimator) | ||||||
with suppress(AttributeError): | ||||||
self.n_features_in_ = base_estimator.n_features_in_ | ||||||
# `classes_` should be consistent with that of base_estimator | ||||||
check_is_fitted(self.base_estimator, attributes=["classes_"]) | ||||||
self.classes_ = self.base_estimator.classes_ | ||||||
|
||||||
pred_method = _get_prediction_method(base_estimator) | ||||||
|
@@ -270,10 +269,6 @@ def fit(self, X, y, sample_weight=None): | |||||
) | ||||||
self.calibrated_classifiers_.append(calibrated_classifier) | ||||||
else: | ||||||
X, y = self._validate_data( | ||||||
X, y, accept_sparse=['csc', 'csr', 'coo'], | ||||||
force_all_finite=False, allow_nd=True | ||||||
) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For consistency, I prefer to never validate the data in the meta-estimator and use |
||||||
# Set `classes_` using all `y` | ||||||
label_encoder_ = LabelEncoder().fit(y) | ||||||
self.classes_ = label_encoder_.classes_ | ||||||
|
@@ -334,6 +329,9 @@ def fit(self, X, y, sample_weight=None): | |||||
) | ||||||
self.calibrated_classifiers_.append(calibrated_classifier) | ||||||
|
||||||
first_clf = self.calibrated_classifiers_[0].base_estimator | ||||||
if hasattr(first_clf, "n_features_in_"): | ||||||
self.n_features_in_ = first_clf.n_features_in_ | ||||||
return self | ||||||
|
||||||
def predict_proba(self, X): | ||||||
|
@@ -352,7 +350,6 @@ def predict_proba(self, X): | |||||
The predicted probas. | ||||||
""" | ||||||
check_is_fitted(self) | ||||||
|
||||||
# Compute the arithmetic mean of the predictions of the calibrated | ||||||
# classifiers | ||||||
mean_proba = np.zeros((_num_samples(X), len(self.classes_))) | ||||||
|
@@ -431,19 +428,26 @@ def _fit_classifier_calibrator_pair(estimator, X, y, train, test, supports_sw, | |||||
------- | ||||||
calibrated_classifier : _CalibratedClassifier instance | ||||||
""" | ||||||
if sample_weight is not None and supports_sw: | ||||||
estimator.fit(X[train], y[train], | ||||||
sample_weight=sample_weight[train]) | ||||||
X_train, y_train = _safe_indexing(X, train), _safe_indexing(y, train) | ||||||
X_test, y_test = _safe_indexing(X, test), _safe_indexing(y, test) | ||||||
if supports_sw and sample_weight is not None: | ||||||
sw_train = _safe_indexing(sample_weight, train) | ||||||
sw_test = _safe_indexing(sample_weight, test) | ||||||
else: | ||||||
sw_train = None | ||||||
sw_test = None | ||||||
|
||||||
if supports_sw: | ||||||
estimator.fit(X_train, y_train, sample_weight=sw_train) | ||||||
else: | ||||||
estimator.fit(X[train], y[train]) | ||||||
estimator.fit(X_train, y_train) | ||||||
|
||||||
n_classes = len(classes) | ||||||
pred_method = _get_prediction_method(estimator) | ||||||
predictions = _compute_predictions(pred_method, X[test], n_classes) | ||||||
predictions = _compute_predictions(pred_method, X_test, n_classes) | ||||||
|
||||||
sw = None if sample_weight is None else sample_weight[test] | ||||||
calibrated_classifier = _fit_calibrator( | ||||||
estimator, predictions, y[test], classes, method, sample_weight=sw | ||||||
estimator, predictions, y_test, classes, method, sample_weight=sw_test | ||||||
) | ||||||
return calibrated_classifier | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -512,27 +512,27 @@ def decision_function(self, X): | |
|
||
|
||
@pytest.fixture | ||
def text_data(): | ||
text_data = [ | ||
def dict_data(): | ||
dict_data = [ | ||
{'state': 'NY', 'age': 'adult'}, | ||
{'state': 'TX', 'age': 'adult'}, | ||
{'state': 'VT', 'age': 'child'}, | ||
] | ||
text_labels = [1, 0, 1] | ||
return text_data, text_labels | ||
return dict_data, text_labels | ||
|
||
|
||
@pytest.fixture | ||
def text_data_pipeline(text_data): | ||
X, y = text_data | ||
def dict_data_pipeline(dict_data): | ||
X, y = dict_data | ||
pipeline_prefit = Pipeline([ | ||
('vectorizer', DictVectorizer()), | ||
('clf', RandomForestClassifier()) | ||
]) | ||
return pipeline_prefit.fit(X, y) | ||
|
||
|
||
def test_calibration_pipeline(text_data, text_data_pipeline): | ||
def test_calibration_dict_pipeline(dict_data, dict_data_pipeline): | ||
"""Test that calibration works in prefit pipeline with transformer | ||
|
||
`X` is not array-like, sparse matrix or dataframe at the start. | ||
|
@@ -541,15 +541,17 @@ def test_calibration_pipeline(text_data, text_data_pipeline): | |
Also test it can predict without running into validation errors. | ||
See https://github.com/scikit-learn/scikit-learn/issues/19637 | ||
""" | ||
X, y = text_data | ||
clf = text_data_pipeline | ||
X, y = dict_data | ||
clf = dict_data_pipeline | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @thomasjpfan note that while working on this, I discovered that [detail omitted in page render]. This is not a problem for this PR because I made [detail omitted in page render]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What do you suggest to be the alternative? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I opened #19740 to have |
||
calib_clf = CalibratedClassifierCV(clf, cv='prefit') | ||
calib_clf.fit(X, y) | ||
# Check attributes are obtained from fitted estimator | ||
assert_array_equal(calib_clf.classes_, clf.classes_) | ||
msg = "'CalibratedClassifierCV' object has no attribute" | ||
with pytest.raises(AttributeError, match=msg): | ||
calib_clf.n_features_in_ | ||
|
||
# Neither the pipeline nor the calibration meta-estimator | ||
# expose the n_features_in_ check on this kind of data. | ||
assert not hasattr(clf, 'n_features_in_') | ||
assert not hasattr(calib_clf, 'n_features_in_') | ||
|
||
# Ensure that no error is thrown with predict and predict_proba | ||
calib_clf.predict(X) | ||
|
@@ -578,6 +580,19 @@ def test_calibration_attributes(clf, cv): | |
assert calib_clf.n_features_in_ == X.shape[1] | ||
|
||
|
||
def test_calibration_inconsistent_prefit_n_features_in(): | ||
# Check that `n_features_in_` from prefit base estimator | ||
# is consistent with training set | ||
X, y = make_classification(n_samples=10, n_features=5, | ||
n_classes=2, random_state=7) | ||
clf = LinearSVC(C=1).fit(X, y) | ||
calib_clf = CalibratedClassifierCV(clf, cv='prefit') | ||
|
||
msg = "X has 3 features, but LinearSVC is expecting 5 features as input." | ||
with pytest.raises(ValueError, match=msg): | ||
calib_clf.fit(X[:, :3], y) | ||
|
||
|
||
# FIXME: remove in 1.1 | ||
def test_calibrated_classifier_cv_deprecation(data): | ||
# Check that we raise the proper deprecation warning if accessing | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -959,10 +959,13 @@ def check_dtype_object(name, estimator_orig): | |
|
||
|
||
def check_complex_data(name, estimator_orig): | ||
rng = np.random.RandomState(42) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this change related to this PR? (I'm happy to keep it here anyway.) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, otherwise it would fail in this PR because the validation is now delegated to the underlying estimator, but after the CV split of |
||
# check that estimators raise an exception on providing complex data | ||
X = np.random.sample(10) + 1j * np.random.sample(10) | ||
X = rng.uniform(size=10) + 1j * rng.uniform(size=10) | ||
X = X.reshape(-1, 1) | ||
y = np.random.sample(10) + 1j * np.random.sample(10) | ||
|
||
# Something both valid for classification and regression | ||
y = rng.randint(low=0, high=2, size=10) + 1j | ||
estimator = clone(estimator_orig) | ||
with raises(ValueError, match="Complex data not supported"): | ||
estimator.fit(X, y) | ||
|
Uh oh!
There was an error while loading. Please reload this page.