r_regression and abs_r_regression added #8353

Closed · wants to merge 10 commits
4 changes: 4 additions & 0 deletions sklearn/feature_selection/__init__.py
@@ -8,6 +8,8 @@
from .univariate_selection import f_classif
from .univariate_selection import f_oneway
from .univariate_selection import f_regression
from .univariate_selection import r_regression
from .univariate_selection import abs_r_regression
from .univariate_selection import SelectPercentile
from .univariate_selection import SelectKBest
from .univariate_selection import SelectFpr
@@ -39,5 +41,7 @@
'f_classif',
'f_oneway',
'f_regression',
'r_regression',
'abs_r_regression',
'mutual_info_classif',
'mutual_info_regression']
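
A minimal usage sketch of the newly exported scoring functions (assuming a scikit-learn build that includes this PR):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import r_regression, abs_r_regression

# 5 informative features out of 20, as in the tests below
X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                       shuffle=False, random_state=0)
r = r_regression(X, y)          # signed Pearson r per feature, shape (20,)
abs_r = abs_r_regression(X, y)  # |r|, usable as a SelectKBest score_func
assert np.allclose(abs_r, np.abs(r))
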
48 changes: 45 additions & 3 deletions sklearn/feature_selection/tests/test_feature_select.py
@@ -26,9 +26,10 @@
from sklearn.datasets.samples_generator import (make_classification,
make_regression)
from sklearn.feature_selection import (
chi2, f_classif, f_oneway, f_regression, mutual_info_classif,
mutual_info_regression, SelectPercentile, SelectKBest, SelectFpr,
SelectFdr, SelectFwe, GenericUnivariateSelect)
chi2, f_classif, f_oneway, f_regression, abs_r_regression,
mutual_info_classif, mutual_info_regression, SelectPercentile,
SelectKBest, SelectFpr, SelectFdr, SelectFwe,
GenericUnivariateSelect)


##############################################################################
@@ -79,6 +80,28 @@ def test_f_classif():
assert_array_almost_equal(pv_sparse, pv)


def test_abs_r_regression():
# Test whether abs_r_regression yields meaningful results
# on a simple simulated regression problem
X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
shuffle=False, random_state=0)

abs_pearson_r = abs_r_regression(X, y)
assert_true((abs_pearson_r < 1).all())
assert_true((abs_pearson_r[:5] > 0.1).all())
assert_true((abs_pearson_r[5:] < 0.2).all())

# with centering, compare with sparse
abs_pearson_r = abs_r_regression(X, y, center=True)
abs_pearson_r_sparse = abs_r_regression(sparse.csr_matrix(X), y, center=True)
assert_array_almost_equal(abs_pearson_r_sparse, abs_pearson_r)

# again without centering, compare with sparse
abs_pearson_r = abs_r_regression(X, y, center=False)
abs_pearson_r_sparse = abs_r_regression(sparse.csr_matrix(X), y, center=False)
assert_array_almost_equal(abs_pearson_r_sparse, abs_pearson_r)


def test_f_regression():
# Test whether the F test yields meaningful results
# on a simple simulated regression problem
@@ -357,6 +380,25 @@ def test_select_kbest_regression():
assert_array_equal(support, gtruth)


def test_select_kbest_abs_r_regression():
# Test whether the relative univariate feature selection
# gets the correct items in a simple regression problem
# with the k best heuristic
X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
shuffle=False, random_state=0, noise=10)

univariate_filter = SelectKBest(abs_r_regression, k=5)
X_r = univariate_filter.fit(X, y).transform(X)
assert_best_scores_kept(univariate_filter)
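# f_regression is expected to keep the same top-5 features here:
# the F statistic is a monotone function of r ** 2, so it ranks
# features identically to abs_r_regression.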
X_r2 = GenericUnivariateSelect(
f_regression, mode='k_best', param=5).fit(X, y).transform(X)
assert_array_equal(X_r, X_r2)
support = univariate_filter.get_support()
gtruth = np.zeros(20)
gtruth[:5] = 1
assert_array_equal(support, gtruth)


def test_select_heuristics_regression():
# Test whether the relative univariate feature selection
# gets the correct items in a simple regression problem
82 changes: 69 additions & 13 deletions sklearn/feature_selection/univariate_selection.py
@@ -227,19 +227,16 @@ def chi2(X, y):
return _chisquare(observed, expected)


def f_regression(X, y, center=True):
"""Univariate linear regression tests.
def r_regression(X, y, center=True):
"""Univariate linear regression tests returning Pearson R.

Linear model for testing the individual effect of each of many regressors.
This is a scoring function to be used in a feature selection procedure, not
a free standing feature selection procedure.

This is done in 2 steps:

1. The correlation between each regressor and the target is computed,
that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
The cross correlation between each regressor and the target is computed,
that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
std(y)).
2. It is converted to an F score then to a p-value.

For more on usage see the :ref:`User Guide <univariate_feature_selection>`.

@@ -256,15 +253,14 @@ def f_regression(X, y, center=True):

Returns
-------
F : array, shape=(n_features,)
F values of features.

pval : array, shape=(n_features,)
p-values of F-scores.
corr : array, shape=(n_features,)
Pearson R correlation coefficients of features.


See also
--------
f_regression: Univariate linear regression tests returning F-statistic
and p-values.
mutual_info_regression: Mutual information for a continuous target.
f_classif: ANOVA F-value between label/feature for classification tasks.
chi2: Chi-squared stats of non-negative features for classification tasks.
@@ -297,14 +293,72 @@ def f_regression(X, y, center=True):
corr = safe_sparse_dot(y, X)
corr /= X_norms
corr /= np.linalg.norm(y)
return corr
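
The normalized dot product computed above is exactly the per-feature Pearson r; a quick cross-check against scipy (a sketch, again assuming this PR's r_regression is importable):

import numpy as np
from scipy import stats
from sklearn.datasets import make_regression
from sklearn.feature_selection import r_regression

X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                       shuffle=False, random_state=0)
# column-wise Pearson r computed the long way
expected = np.array([stats.pearsonr(X[:, j], y)[0] for j in range(X.shape[1])])
assert np.allclose(r_regression(X, y), expected)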

# convert to p-value

def f_regression(X, y, center=True):
"""Univariate linear regression tests returning F-statistic and p-values.

Quick linear model for testing the effect of a single regressor,
sequentially for many regressors.

This is done in 2 steps:

1. The cross correlation between each regressor and the target is computed,
that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
std(y)), using the r_regression function.
2. It is converted to an F score and then to a p-value.

Read more in the :ref:`User Guide <univariate_feature_selection>`.

Parameters
----------
X : {array-like, sparse matrix} shape = (n_samples, n_features)
The set of regressors that will be tested sequentially.

y : array of shape (n_samples,)
The target vector.

center : bool, default=True
If True, X and y will be centered.

Returns
-------
F : array, shape=(n_features,)
F values of features.

pval : array, shape=(n_features,)
p-values of F-scores.

See also
--------
r_regression: Univariate linear regression tests returning Pearson R.
f_classif: ANOVA F-value between label/feature for classification tasks.
chi2: Chi-squared stats of non-negative features for classification tasks.
"""

# compute the correlation
corr = r_regression(X, y, center=center)
degrees_of_freedom = y.size - (2 if center else 1)
# convert to p-value
F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
pv = stats.f.sf(F, 1, degrees_of_freedom)
return F, pv
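
A numeric sanity check of the conversion above (illustrative sketch): for one regressor with centering, F = r ** 2 / (1 - r ** 2) * (n - 2), and the F(1, n-2) survival function reproduces the two-sided p-value of scipy's pearsonr:

import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
x = rng.randn(50)
y = 2 * x + rng.randn(50)

r, p_two_sided = stats.pearsonr(x, y)
dof = x.size - 2                     # centering costs one extra dof
F = r ** 2 / (1 - r ** 2) * dof
assert np.isclose(stats.f.sf(F, 1, dof), p_two_sided)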


def abs_r_regression(X, y, center=True):
"""Absolute value of Pearson R from univariate linear regressions.

This convenience wrapper is to be used with SelectKBest and other models
that require a statistic that increases with the significance of the
association.

See r_regression for details.
"""
# compute the correlation
corr = r_regression(X, y, center=center)
return abs(corr)
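
The point of the wrapper is that SelectKBest treats scores as higher-is-better, which holds for |r| but not for signed r (a strongly negative correlation would otherwise rank last). A short sketch, under the same assumption that this PR's functions are available:

from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, abs_r_regression

X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                       shuffle=False, random_state=0, noise=10)
# keep the 5 features with the largest |Pearson r|
X_top5 = SelectKBest(abs_r_regression, k=5).fit_transform(X, y)
print(X_top5.shape)  # (200, 5)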

######################################################################
# Base classes

@@ -472,6 +526,8 @@ class SelectKBest(_BaseFilter):
f_classif: ANOVA F-value between label/feature for classification tasks.
mutual_info_classif: Mutual information for a discrete target.
chi2: Chi-squared stats of non-negative features for classification tasks.
abs_r_regression: Absolute value of Pearson R between label/feature for
regression tasks.
f_regression: F-value between label/feature for regression tasks.
mutual_info_regression: Mutual information for a continuous target.
SelectPercentile: Select features based on percentile of the highest scores.