r_regression and abs_r_regression added #8353

Closed · wants to merge 10 commits
4 changes: 4 additions & 0 deletions sklearn/feature_selection/__init__.py
@@ -8,6 +8,8 @@
from .univariate_selection import f_classif
from .univariate_selection import f_oneway
from .univariate_selection import f_regression
from .univariate_selection import r_regression
from .univariate_selection import abs_r_regression
from .univariate_selection import SelectPercentile
from .univariate_selection import SelectKBest
from .univariate_selection import SelectFpr
@@ -39,5 +41,7 @@
'f_classif',
'f_oneway',
'f_regression',
'r_regression',
'abs_r_regression',
'mutual_info_classif',
'mutual_info_regression']
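
A minimal usage sketch of the newly exported scoring functions (assuming a scikit-learn build that includes this PR):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import r_regression, abs_r_regression

# 5 informative features out of 20, as in the tests below
X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                       shuffle=False, random_state=0)
r = r_regression(X, y)          # signed Pearson r per feature, shape (20,)
abs_r = abs_r_regression(X, y)  # |r|, usable as a SelectKBest score_func
assert np.allclose(abs_r, np.abs(r))
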
48 changes: 45 additions & 3 deletions sklearn/feature_selection/tests/test_feature_select.py
@@ -26,9 +26,10 @@
from sklearn.datasets.samples_generator import (make_classification,
make_regression)
from sklearn.feature_selection import (
chi2, f_classif, f_oneway, f_regression, mutual_info_classif,
mutual_info_regression, SelectPercentile, SelectKBest, SelectFpr,
SelectFdr, SelectFwe, GenericUnivariateSelect)
chi2, f_classif, f_oneway, f_regression, abs_r_regression,
mutual_info_classif, mutual_info_regression, SelectPercentile,
SelectKBest, SelectFpr, SelectFdr, SelectFwe,
GenericUnivariateSelect)


##############################################################################
@@ -79,6 +80,28 @@ def test_f_classif():
assert_array_almost_equal(pv_sparse, pv)


def test_abs_r_regression():
# Test whether abs_r_regression yields meaningful results
# on a simple simulated regression problem
X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
shuffle=False, random_state=0)

abs_pearson_r = abs_r_regression(X, y)
assert_true((abs_pearson_r < 1).all())
assert_true((abs_pearson_r[:5] > 0.1).all())
assert_true((abs_pearson_r[5:] < 0.2).all())

# with centering, compare with sparse
abs_pearson_r = abs_r_regression(X, y, center=True)
abs_pearson_r_sparse = abs_r_regression(sparse.csr_matrix(X), y, center=True)
assert_array_almost_equal(abs_pearson_r_sparse, abs_pearson_r)

# again without centering, compare with sparse
abs_pearson_r = abs_r_regression(X, y, center=False)
abs_pearson_r_sparse = abs_r_regression(sparse.csr_matrix(X), y, center=False)
assert_array_almost_equal(abs_pearson_r_sparse, abs_pearson_r)


def test_f_regression():
# Test whether the F test yields meaningful results
# on a simple simulated regression problem
@@ -357,6 +380,25 @@ def test_select_kbest_regression():
assert_array_equal(support, gtruth)


def test_select_kbest_abs_r_regression():
# Test whether the relative univariate feature selection
# gets the correct items in a simple regression problem
# with the k best heuristic
X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
shuffle=False, random_state=0, noise=10)

univariate_filter = SelectKBest(abs_r_regression, k=5)
X_r = univariate_filter.fit(X, y).transform(X)
assert_best_scores_kept(univariate_filter)
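# f_regression is expected to keep the same top-5 features here:
# the F statistic is a monotone function of r ** 2, so it ranks
# features identically to abs_r_regression.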
X_r2 = GenericUnivariateSelect(
f_regression, mode='k_best', param=5).fit(X, y).transform(X)
assert_array_equal(X_r, X_r2)
support = univariate_filter.get_support()
gtruth = np.zeros(20)
gtruth[:5] = 1
assert_array_equal(support, gtruth)


def test_select_heuristics_regression():
# Test whether the relative univariate feature selection
# gets the correct items in a simple regression problem
82 changes: 69 additions & 13 deletions sklearn/feature_selection/univariate_selection.py
@@ -227,19 +227,16 @@ def chi2(X, y):
return _chisquare(observed, expected)


def f_regression(X, y, center=True):
"""Univariate linear regression tests.
def r_regression(X, y, center=True):
"""Univariate linear regression tests returning Pearson R.

Linear model for testing the individual effect of each of many regressors.
This is a scoring function to be used in a feature selection procedure, not
a free standing feature selection procedure.

This is done in 2 steps:

1. The correlation between each regressor and the target is computed,
that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
The cross correlation between each regressor and the target is computed,
that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
std(y)).
2. It is converted to an F score then to a p-value.

For more on usage see the :ref:`User Guide <univariate_feature_selection>`.

@@ -256,15 +253,14 @@ def f_regression(X, y, center=True):

Returns
-------
F : array, shape=(n_features,)
F values of features.

pval : array, shape=(n_features,)
p-values of F-scores.
corr : array, shape=(n_features,)
Pearson R correlation coefficients of features.


See also
--------
f_regression: Univariate linear regression tests returning F-statistic
and p-values.
mutual_info_regression: Mutual information for a continuous target.
f_classif: ANOVA F-value between label/feature for classification tasks.
chi2: Chi-squared stats of non-negative features for classification tasks.
@@ -297,14 +293,72 @@ def f_regression(X, y, center=True):
corr = safe_sparse_dot(y, X)
corr /= X_norms
corr /= np.linalg.norm(y)
return corr
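
The normalized dot product computed above is exactly the per-feature Pearson r; a quick cross-check against scipy (a sketch, again assuming this PR's r_regression is importable):

import numpy as np
from scipy import stats
from sklearn.datasets import make_regression
from sklearn.feature_selection import r_regression

X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                       shuffle=False, random_state=0)
# column-wise Pearson r computed the long way
expected = np.array([stats.pearsonr(X[:, j], y)[0] for j in range(X.shape[1])])
assert np.allclose(r_regression(X, y), expected)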

# convert to p-value

def f_regression(X, y, center=True):
"""Univariate linear regression tests returning F-statistic and p-values.

Quick linear model for testing the effect of a single regressor,
sequentially for many regressors.

This is done in 2 steps:

1. The cross correlation between each regressor and the target is computed,
that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
std(y)), using the r_regression function.
2. It is converted to an F score and then to a p-value.

Read more in the :ref:`User Guide <univariate_feature_selection>`.

Parameters
----------
X : {array-like, sparse matrix} shape = (n_samples, n_features)
The set of regressors that will be tested sequentially.

y : array of shape (n_samples,)
The target vector.

center : bool, default=True
If True, X and y will be centered.

Returns
-------
F : array, shape=(n_features,)
F values of features.

pval : array, shape=(n_features,)
p-values of F-scores.

See also
--------
r_regression: Univariate linear regression tests returning Pearson R.
f_classif: ANOVA F-value between label/feature for classification tasks.
chi2: Chi-squared stats of non-negative features for classification tasks.
"""

# compute the correlation
corr = r_regression(X, y, center=center)
degrees_of_freedom = y.size - (2 if center else 1)
# convert to p-value
F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
pv = stats.f.sf(F, 1, degrees_of_freedom)
return F, pv
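
A numeric sanity check of the conversion above (illustrative sketch): for one regressor with centering, F = r ** 2 / (1 - r ** 2) * (n - 2), and the F(1, n-2) survival function reproduces the two-sided p-value of scipy's pearsonr:

import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
x = rng.randn(50)
y = 2 * x + rng.randn(50)

r, p_two_sided = stats.pearsonr(x, y)
dof = x.size - 2                     # centering costs one extra dof
F = r ** 2 / (1 - r ** 2) * dof
assert np.isclose(stats.f.sf(F, 1, dof), p_two_sided)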


def abs_r_regression(X, y, center=True):
"""Absolute value of Pearson R from univariate linear regressions.

This convenience wrapper is to be used with SelectKBest and other models
that require a statistic that increases with the significance of the
association.

See r_regression for details.
"""
# compute the correlation
corr = r_regression(X, y, center=center)
return abs(corr)
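
The point of the wrapper is that SelectKBest treats scores as higher-is-better, which holds for |r| but not for signed r (a strongly negative correlation would otherwise rank last). A short sketch, under the same assumption that this PR's functions are available:

from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, abs_r_regression

X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                       shuffle=False, random_state=0, noise=10)
# keep the 5 features with the largest |Pearson r|
X_top5 = SelectKBest(abs_r_regression, k=5).fit_transform(X, y)
print(X_top5.shape)  # (200, 5)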

######################################################################
# Base classes

@@ -472,6 +526,8 @@ class SelectKBest(_BaseFilter):
f_classif: ANOVA F-value between label/feature for classification tasks.
mutual_info_classif: Mutual information for a discrete target.
chi2: Chi-squared stats of non-negative features for classification tasks.
abs_r_regression: Absolute value of Pearson R between label/feature for
regression tasks.
f_regression: F-value between label/feature for regression tasks.
mutual_info_regression: Mutual information for a continuous target.
SelectPercentile: Select features based on percentile of the highest scores.