
[MRG] ENH Consistent loss name for squared error #19310

Merged
merged 28 commits on Mar 19, 2021
Changes from all commits
28 commits
820b7b7
MNT deprecate mse criterion in tree module
lorentzenchr Jan 31, 2021
627c343
MNT deprecate mse criterion for RandomForestRegressor
lorentzenchr Jan 31, 2021
bc3b7f8
MNT deprecate criterion mse and loss ls in GradientBoosting
lorentzenchr Jan 31, 2021
2fbc6ee
MNT deprecate loss least_squares in HistGradientBoostingRegressor
lorentzenchr Jan 31, 2021
fdd21f6
MNT deprecate loss squared_loss in linear_model SGD
lorentzenchr Jan 31, 2021
590f2f6
MNT/TST replace criterion 'mse' by 'squared_error' in PDP tests
lorentzenchr Jan 31, 2021
fa7f8bd
MNT/TST forgot a few deprecated 'ls' in gradient boosting tests
lorentzenchr Jan 31, 2021
ab4c861
MNT/TST replace squared_loss in test_sgd.py
lorentzenchr Jan 31, 2021
7d3d2bd
MNT deprecate loss squared_loss in RANSACRegressor
lorentzenchr Jan 31, 2021
67ceac9
MNT internally rename squared_loss to squared_error in neural_network
lorentzenchr Jan 31, 2021
83bb09a
MNT replace losses in benchmarks
lorentzenchr Jan 31, 2021
baec17d
DOC replace losses in docs
lorentzenchr Jan 31, 2021
cb0c4e4
EXA replace losses in examples
lorentzenchr Jan 31, 2021
fded6f7
MNT replace least_squares in HGBT utils
lorentzenchr Jan 31, 2021
0777251
CLN correct directive deprecated
lorentzenchr Jan 31, 2021
68e1f9b
CLN filter FutureWarning for squared_loss in SGD tests
lorentzenchr Jan 31, 2021
8692240
CLN hiccups in SGD tests due to param checks in init of BaseSGD
lorentzenchr Jan 31, 2021
1d570bd
Merge branch 'main' into consistent_squared_error
lorentzenchr Feb 1, 2021
1bf1c0b
Merge branch 'main' into consistent_squared_error
lorentzenchr Feb 18, 2021
1e9683a
CLN fix double import of pytest
lorentzenchr Feb 18, 2021
91ec366
address review comments 1st round
lorentzenchr Feb 27, 2021
cc94841
Merge branch 'main' into consistent_squared_error
lorentzenchr Feb 27, 2021
e3e92d7
FIX test_export.py
lorentzenchr Mar 2, 2021
0bfc742
Merge branch 'main' into consistent_squared_error
lorentzenchr Mar 2, 2021
0179ac9
DOC add whatsnew entry
lorentzenchr Mar 2, 2021
b50bd75
DOC use |API| tag in whatsnew
lorentzenchr Mar 15, 2021
e288a6a
FIX criterion="mse" test in forest
lorentzenchr Mar 15, 2021
7d220b8
FIX check for DecisionTreeRegressor ExtraTreeRegressor in ensemble base
lorentzenchr Mar 15, 2021
2 changes: 1 addition & 1 deletion benchmarks/bench_hist_gradient_boosting.py
@@ -110,7 +110,7 @@ def one_run(n_samples):
else:
# regression
if loss == 'default':
loss = 'least_squares'
loss = 'squared_error'
est.set_params(loss=loss)
est.fit(X_train, y_train, sample_weight=sample_weight_train)
sklearn_fit_duration = time() - tic
2 changes: 1 addition & 1 deletion benchmarks/bench_hist_gradient_boosting_threading.py
@@ -112,7 +112,7 @@ def get_estimator_and_data():
else:
# regression
if loss == 'default':
loss = 'least_squares'
loss = 'squared_error'
sklearn_est.set_params(loss=loss)


15 changes: 9 additions & 6 deletions doc/modules/ensemble.rst
@@ -537,7 +537,8 @@ Regression
:class:`GradientBoostingRegressor` supports a number of
:ref:`different loss functions <gradient_boosting_loss>`
for regression which can be specified via the argument
``loss``; the default loss function for regression is least squares (``'ls'``).
``loss``; the default loss function for regression is squared error
(``'squared_error'``).

::

@@ -549,8 +550,10 @@ for regression which can be specified via the argument
>>> X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
>>> X_train, X_test = X[:200], X[200:]
>>> y_train, y_test = y[:200], y[200:]
>>> est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
... max_depth=1, random_state=0, loss='ls').fit(X_train, y_train)
>>> est = GradientBoostingRegressor(
... n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0,
... loss='squared_error'
... ).fit(X_train, y_train)
>>> mean_squared_error(y_test, est.predict(X_test))
5.00...

@@ -741,8 +744,8 @@ the parameter ``loss``:

* Regression

* Least squares (``'ls'``): The natural choice for regression due
to its superior computational properties. The initial model is
* Squared error (``'squared_error'``): The natural choice for regression
due to its superior computational properties. The initial model is
given by the mean of the target values.
* Least absolute deviation (``'lad'``): A robust loss function for
regression. The initial model is given by the median of the
@@ -950,7 +953,7 @@ controls the number of iterations of the boosting process::
>>> clf.score(X_test, y_test)
0.8965

Available losses for regression are 'least_squares',
Available losses for regression are 'squared_error',
'least_absolute_deviation', which is less sensitive to outliers, and
'poisson', which is well suited to model counts and frequencies. For
classification, 'binary_crossentropy' is used for binary classification and
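
For readers following along, a minimal sketch (not part of this PR's diff) of the renamed loss in HistGradientBoostingRegressor; it assumes scikit-learn >= 1.0, where 'squared_error' replaces 'least_squares':

from sklearn.datasets import make_regression
from sklearn.ensemble import HistGradientBoostingRegressor

# Toy regression data; any numeric X, y works here.
X, y = make_regression(n_samples=500, n_features=5, noise=1.0, random_state=0)

# New spelling; the old 'least_squares' still works but emits a FutureWarning.
est = HistGradientBoostingRegressor(loss='squared_error', max_iter=100,
                                    random_state=0)
est.fit(X, y)
print(est.score(X, y))  # R^2 on the training data
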
8 changes: 4 additions & 4 deletions doc/modules/sgd.rst
@@ -30,7 +30,7 @@ For example, using `SGDClassifier(loss='log')` results in logistic regression,
i.e. a model equivalent to :class:`~sklearn.linear_model.LogisticRegression`
which is fitted via SGD instead of being fitted by one of the other solvers
in :class:`~sklearn.linear_model.LogisticRegression`. Similarly,
`SGDRegressor(loss='squared_loss', penalty='l2')` and
`SGDRegressor(loss='squared_error', penalty='l2')` and
:class:`~sklearn.linear_model.Ridge` solve the same optimization problem, via
different means.

@@ -211,7 +211,7 @@ samples (> 10.000), for other problems we recommend :class:`Ridge`,
The concrete loss function can be set via the ``loss``
parameter. :class:`SGDRegressor` supports the following loss functions:

* ``loss="squared_loss"``: Ordinary least squares,
* ``loss="squared_error"``: Ordinary least squares,
* ``loss="huber"``: Huber loss for robust regression,
* ``loss="epsilon_insensitive"``: linear Support Vector Regression.

@@ -362,9 +362,9 @@ Different choices for :math:`L` entail different classifiers or regressors:

- Hinge (soft-margin): equivalent to Support Vector Classification.
:math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))`.
- Perceptron:
- Perceptron:
:math:`L(y_i, f(x_i)) = \max(0, - y_i f(x_i))`.
- Modified Huber:
- Modified Huber:
:math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))^2` if :math:`y_i f(x_i) >
1`, and :math:`L(y_i, f(x_i)) = -4 y_i f(x_i)` otherwise.
- Log: equivalent to Logistic Regression.
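
A minimal sketch of the SGDRegressor/Ridge correspondence noted above, under the new loss name (an illustration assuming scikit-learn >= 1.0; the two estimators parameterize the L2 penalty differently, so the coefficients only roughly agree here):

import numpy as np
from sklearn.linear_model import Ridge, SGDRegressor

rng = np.random.RandomState(0)
X = rng.normal(size=(1000, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.normal(size=1000)

# 'squared_error' is the new name for the former 'squared_loss'.
sgd = SGDRegressor(loss='squared_error', penalty='l2', alpha=1e-4,
                   max_iter=2000, tol=1e-6, random_state=0).fit(X, y)
ridge = Ridge(alpha=1e-4).fit(X, y)

# Both minimize an L2-penalized squared error; the fits should be close,
# though not identical because SGD is stochastic and the penalty scaling
# differs between the two estimators.
print(sgd.coef_)
print(ridge.coef_)
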
31 changes: 31 additions & 0 deletions doc/whats_new/v1.0.rst
@@ -45,6 +45,37 @@ Changelog
:pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
where 123456 is the *pull request* number, not the issue number.

- |API| The option for using the squared error via ``loss`` and
``criterion`` parameters was made more consistent. The preferred way is by
setting the value to `"squared_error"`. Old option names are still valid and
produce the same models, but they are deprecated and will be removed in
version 1.2.
:pr:`19310` by :user:`Christian Lorentzen <lorentzenchr>`.

- For :class:`ensemble.ExtraTreesRegressor`, `criterion="mse"` is deprecated,
use `"squared_error"` instead which is now the default.

- For :class:`ensemble.GradientBoostingRegressor`, `loss="ls"` is deprecated,
use `"squared_error"` instead which is now the default.

- For :class:`ensemble.RandomForestRegressor`, `criterion="mse"` is deprecated,
use `"squared_error"` instead which is now the default.

- For :class:`ensemble.HistGradientBoostingRegressor`, `loss="least_squares"`
is deprecated, use `"squared_error"` instead which is now the default.

- For :class:`linear_model.RANSACRegressor`, `loss="squared_loss"` is
deprecated, use `"squared_error"` instead.

- For :class:`linear_model.SGDRegressor`, `loss="squared_loss"` is
deprecated, use `"squared_error"` instead which is now the default.

- For :class:`tree.DecisionTreeRegressor`, `criterion="mse"` is deprecated,
use `"squared_error"` instead which is now the default.

- For :class:`tree.ExtraTreeRegressor`, `criterion="mse"` is deprecated,
use `"squared_error"` instead which is now the default.

:mod:`sklearn.cluster`
......................

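
In practice, the deprecations listed above behave as in the following sketch (illustrative only, not part of the diff; assumes scikit-learn 1.0 with this change merged):

import warnings
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=100, n_features=4, random_state=0)

# Old spelling: still fits the same model, but warns about the rename.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    DecisionTreeRegressor(criterion="mse", random_state=0).fit(X, y)
print(any(issubclass(w.category, FutureWarning) for w in caught))  # True

# New spelling, which is also the new default.
DecisionTreeRegressor(criterion="squared_error", random_state=0).fit(X, y)
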
2 changes: 1 addition & 1 deletion examples/applications/plot_model_complexity_influence.py
@@ -177,7 +177,7 @@ def _count_nonzero_coefficients(estimator):
'prediction_performance_label': 'MSE',
'n_samples': 30},
{'estimator': GradientBoostingRegressor,
'tuned_params': {'loss': 'ls'},
'tuned_params': {'loss': 'squared_error'},
'changing_param': 'n_estimators',
'changing_param_values': [10, 50, 100, 200, 500],
'complexity_label': 'n_trees',
24 changes: 12 additions & 12 deletions examples/ensemble/plot_gradient_boosting_quantile.py
@@ -71,24 +71,24 @@ def f(x):
all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train)

# %%
# For the sake of comparison, also fit a baseline model trained with the usual
# least squares loss (ls), also known as the mean squared error (MSE).
gbr_ls = GradientBoostingRegressor(loss='ls', **common_params)
all_models["ls"] = gbr_ls.fit(X_train, y_train)
# For the sake of comparison, we also fit a baseline model trained with the
# usual (mean) squared error (MSE).
gbr_ls = GradientBoostingRegressor(loss='squared_error', **common_params)
all_models["mse"] = gbr_ls.fit(X_train, y_train)

# %%
# Create an evenly spaced evaluation set of input values spanning the [0, 10]
# range.
xx = np.atleast_2d(np.linspace(0, 10, 1000)).T

# %%
# Plot the true conditional mean function f, the prediction of the conditional
# mean (least squares loss), the conditional median and the conditional 90%
# interval (from 5th to 95th conditional percentiles).
# Plot the true conditional mean function f, the predictions of the conditional
# mean (loss equals squared error), the conditional median and the conditional
# 90% interval (from 5th to 95th conditional percentiles).
import matplotlib.pyplot as plt


y_pred = all_models['ls'].predict(xx)
y_pred = all_models['mse'].predict(xx)
y_lower = all_models['q 0.05'].predict(xx)
y_upper = all_models['q 0.95'].predict(xx)
y_med = all_models['q 0.50'].predict(xx)
@@ -153,7 +153,7 @@ def highlight_min(x):
#
# Note that because the target distribution is asymmetric, the expected
# conditional mean and conditional median are significantly different and
# therefore one could not use the least squares model to get a good estimation of
# therefore one could not use the squared error model to get a good estimation of
# the conditional median nor the converse.
#
# If the target distribution were symmetric and had no outliers (e.g. with a
@@ -179,9 +179,9 @@ def highlight_min(x):
# shows that the best test metric is obtained when the model is trained by
# minimizing this same metric.
#
# Note that the conditional median estimator is competitive with the least
# squares estimator in terms of MSE on the test set: this can be explained by
# the fact the least squares estimator is very sensitive to large outliers
# Note that the conditional median estimator is competitive with the squared
# error estimator in terms of MSE on the test set: this can be explained by
# the fact the squared error estimator is very sensitive to large outliers
# which can cause significant overfitting. This can be seen on the right hand
# side of the previous plot. The conditional median estimator is biased
# (underestimation for this asymmetric noise) but is also naturally robust to
2 changes: 1 addition & 1 deletion examples/ensemble/plot_gradient_boosting_regression.py
@@ -67,7 +67,7 @@
'max_depth': 4,
'min_samples_split': 5,
'learning_rate': 0.01,
'loss': 'ls'}
'loss': 'squared_error'}

# %%
# Fit regression model
10 changes: 10 additions & 0 deletions sklearn/ensemble/_base.py
@@ -15,6 +15,7 @@
from ..base import is_classifier, is_regressor
from ..base import BaseEstimator
from ..base import MetaEstimatorMixin
from ..tree import DecisionTreeRegressor, ExtraTreeRegressor
from ..utils import Bunch, _print_elapsed_time
from ..utils import check_random_state
from ..utils.metaestimators import _BaseComposition
@@ -151,6 +152,15 @@ def _make_estimator(self, append=True, random_state=None):
estimator.set_params(**{p: getattr(self, p)
for p in self.estimator_params})

# TODO: Remove in v1.2
# criterion "mse" would cause warnings in every call to
# DecisionTreeRegressor.fit(..)
if (
isinstance(estimator, (DecisionTreeRegressor, ExtraTreeRegressor))
and getattr(estimator, "criterion", None) == "mse"
):
estimator.set_params(criterion="squared_error")

if random_state is not None:
_set_random_states(estimator, random_state)

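
A rough check of the intent behind this remapping (a sketch, not from the PR): with `criterion="mse"` rewritten to `"squared_error"` on each sub-estimator, fitting a forest should emit the deprecation warning once at the ensemble level (see `_forest.py` below) rather than once per tree. Assumes scikit-learn 1.0.

import warnings
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=100, n_features=4, random_state=0)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    RandomForestRegressor(criterion="mse", n_estimators=5,
                          random_state=0).fit(X, y)

n_future = sum(issubclass(w.category, FutureWarning) for w in caught)
print(n_future)  # expected: 1 (from the forest itself), not one per tree
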
41 changes: 30 additions & 11 deletions sklearn/ensemble/_forest.py
@@ -345,6 +345,17 @@ def fit(self, X, y, sample_weight=None):

# Check parameters
self._validate_estimator()
# TODO: Remove in v1.2
if (
isinstance(self, (RandomForestRegressor, ExtraTreesRegressor))
and self.criterion == "mse"
):
warn(
"Criterion 'mse' was deprecated in v1.0 and will be "
"removed in version 1.2. Use `criterion='squared_error'` "
"which is equivalent.",
FutureWarning
)

if not self.bootstrap and self.oob_score:
raise ValueError("Out of bag estimation only available"
@@ -1310,15 +1321,19 @@ class RandomForestRegressor(ForestRegressor):
The default value of ``n_estimators`` changed from 10 to 100
in 0.22.

criterion : {"mse", "mae"}, default="mse"
criterion : {"squared_error", "mse", "mae"}, default="squared_error"
The function to measure the quality of a split. Supported criteria
are "mse" for the mean squared error, which is equal to variance
reduction as feature selection criterion, and "mae" for the mean
absolute error.
are "squared_error" for the mean squared error, which is equal to
variance reduction as feature selection criterion, and "mae" for the
mean absolute error.

.. versionadded:: 0.18
Mean Absolute Error (MAE) criterion.

.. deprecated:: 1.0
Criterion "mse" was deprecated in v1.0 and will be removed in
version 1.2. Use `criterion="squared_error"` which is equivalent.

max_depth : int, default=None
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
@@ -1537,7 +1552,7 @@ class RandomForestRegressor(ForestRegressor):
@_deprecate_positional_args
def __init__(self,
n_estimators=100, *,
criterion="mse",
criterion="squared_error",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
@@ -1921,15 +1936,19 @@ class ExtraTreesRegressor(ForestRegressor):
The default value of ``n_estimators`` changed from 10 to 100
in 0.22.

criterion : {"mse", "mae"}, default="mse"
criterion : {"squared_error", "mse", "mae"}, default="squared_error"
The function to measure the quality of a split. Supported criteria
are "mse" for the mean squared error, which is equal to variance
reduction as feature selection criterion, and "mae" for the mean
absolute error.
are "squared_error" and "mse" for the mean squared error, which is
equal to variance reduction as feature selection criterion, and "mae"
for the mean absolute error.

.. versionadded:: 0.18
Mean Absolute Error (MAE) criterion.

.. deprecated:: 1.0
Criterion "mse" was deprecated in v1.0 and will be removed in
version 1.2. Use `criterion="squared_error"` which is equivalent.

max_depth : int, default=None
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
@@ -2141,7 +2160,7 @@ class ExtraTreesRegressor(ForestRegressor):
@_deprecate_positional_args
def __init__(self,
n_estimators=100, *,
criterion="mse",
criterion="squared_error",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
@@ -2353,7 +2372,7 @@ class RandomTreesEmbedding(BaseForest):
[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]])
"""

criterion = 'mse'
criterion = "squared_error"
max_features = 1

@_deprecate_positional_args
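
Finally, a small sanity sketch of the whatsnew claim that the old and new spellings produce the same models (illustrative only; assumes scikit-learn 1.0):

import warnings
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=6, random_state=0)

with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)  # silence the deprecation
    old = RandomForestRegressor(criterion="mse", random_state=0).fit(X, y)

new = RandomForestRegressor(criterion="squared_error", random_state=0).fit(X, y)

# Same criterion under the hood and same random_state -> identical predictions.
assert np.allclose(old.predict(X), new.predict(X))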