Skip to content

Commit bc7cd31

Browse files
authored
ENH more efficient _num_combinations calculation in PolynomialFeatures (#19734)
1 parent 108dd7b commit bc7cd31

File tree

3 files changed

+54
-6
lines changed

3 files changed

+54
-6
lines changed

doc/whats_new/v1.0.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,10 @@ Changelog
200200
:pr:`19426` by :user:`Alexandre Gramfort <agramfort>` and
201201
:user:`Maria Telenczuk <maikia>`.
202202

203+
- |Efficiency| The implementation of `fit` for `PolynomialFeatures` transformer
204+
is now faster. This is especially noticeable on large sparse input.
205+
:pr:`19734` by :user:`Fred Robinson <frrad>`.
206+
203207
:mod:`sklearn.manifold`
204208
.......................
205209

sklearn/preprocessing/_polynomial.py

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import numpy as np
99
from scipy import sparse
1010
from scipy.interpolate import BSpline
11+
from scipy.special import comb
1112

1213
from ..base import BaseEstimator, TransformerMixin
1314
from ..utils import check_array
@@ -113,6 +114,29 @@ def _combinations(n_features, degree, interaction_only, include_bias):
113114
return chain.from_iterable(comb(range(n_features), i)
114115
for i in range(start, degree + 1))
115116

117+
@staticmethod
118+
def _num_combinations(n_features, degree, interaction_only, include_bias):
119+
"""Calculate number of terms in polynomial expansion
120+
121+
This should be equivalent to counting the number of terms returned by
122+
_combinations(...) but much faster.
123+
"""
124+
125+
if interaction_only:
126+
combinations = sum(
127+
[
128+
comb(n_features, i, exact=True)
129+
for i in range(1, min(degree + 1, n_features + 1))
130+
]
131+
)
132+
else:
133+
combinations = comb(n_features + degree, degree, exact=True) - 1
134+
135+
if include_bias:
136+
combinations += 1
137+
138+
return combinations
139+
116140
@property
117141
def powers_(self):
118142
check_is_fitted(self)
@@ -170,13 +194,12 @@ def fit(self, X, y=None):
170194
self : object
171195
Fitted transformer.
172196
"""
173-
n_samples, n_features = self._validate_data(
174-
X, accept_sparse=True).shape
175-
combinations = self._combinations(n_features, self.degree,
176-
self.interaction_only,
177-
self.include_bias)
197+
_, n_features = self._validate_data(X, accept_sparse=True).shape
178198
self.n_input_features_ = n_features
179-
self.n_output_features_ = sum(1 for _ in combinations)
199+
self.n_output_features_ = self._num_combinations(
200+
n_features, self.degree, self.interaction_only, self.include_bias
201+
)
202+
180203
return self
181204

182205
def transform(self, X):

sklearn/preprocessing/tests/test_polynomial.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,27 @@ def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype):
552552
assert_array_almost_equal(Xt_csr.A, Xt_dense)
553553

554554

555+
@pytest.mark.parametrize("n_features", [1, 4, 5])
556+
@pytest.mark.parametrize("degree", range(1, 5))
557+
@pytest.mark.parametrize("interaction_only", [True, False])
558+
@pytest.mark.parametrize("include_bias", [True, False])
559+
def test_num_combinations(n_features, degree, interaction_only, include_bias):
560+
"""
561+
Test that n_output_features_ is calculated correctly.
562+
"""
563+
x = sparse.csr_matrix(([1], ([0], [n_features - 1])))
564+
est = PolynomialFeatures(
565+
degree, interaction_only=interaction_only, include_bias=include_bias
566+
)
567+
est.fit(x)
568+
num_combos = est.n_output_features_
569+
570+
combos = PolynomialFeatures._combinations(
571+
n_features, degree, interaction_only, include_bias
572+
)
573+
assert num_combos == sum([1 for _ in combos])
574+
575+
555576
@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'],
556577
[(2, True, False, np.float32),
557578
(2, True, False, np.float64),

0 commit comments

Comments
 (0)