scikit-learn · jnothman · Mar 20, 2021 · Oct 13, 2019 · Oct 13, 2019 · Oct 13, 2019
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
@@ -1176,6 +1176,7 @@ Splitter Classes
    model_selection.ShuffleSplit
    model_selection.StratifiedKFold
    model_selection.StratifiedShuffleSplit
+   model_selection.StratifiedGroupKFold
    model_selection.TimeSeriesSplit
 
 Splitter Functions

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
@@ -353,7 +353,7 @@ Example of 2-fold cross-validation on a dataset with 4 samples::
 Here is a visualization of the cross-validation behavior. Note that
 :class:`KFold` is not affected by classes or groups.
 
-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_004.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_006.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%
@@ -509,7 +509,7 @@ Here is a usage example::
 Here is a visualization of the cross-validation behavior. Note that
 :class:`ShuffleSplit` is not affected by classes or groups.
 
-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_006.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_008.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%
@@ -566,7 +566,7 @@ We can see that :class:`StratifiedKFold` preserves the class ratios
 
 Here is a visualization of the cross-validation behavior.
 
-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_007.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_009.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%
@@ -585,7 +585,7 @@ percentage for each target class as in the complete set.
 
 Here is a visualization of the cross-validation behavior.
 
-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_009.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_012.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%
@@ -645,6 +645,58 @@ size due to the imbalance in the data.
 
 Here is a visualization of the cross-validation behavior.
 
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_007.png
+   :target: ../auto_examples/model_selection/plot_cv_indices.html
+   :align: center
+   :scale: 75%
+
+.. _stratified_group_k_fold:
+
+StratifiedGroupKFold
+^^^^^^^^^^^^^^^^^^^^
+
+:class:`StratifiedGroupKFold` is a cross-validation scheme that combines both
+:class:`StratifiedKFold` and :class:`GroupKFold`. The idea is to try to
+preserve the distribution of classes in each split while keeping each group
+within a single split. This is useful when you have an imbalanced dataset,
+where using just :class:`GroupKFold` might produce skewed splits.
+
+Example::
+
+  >>> from sklearn.model_selection import StratifiedGroupKFold
+  >>> X = list(range(18))
+  >>> y = [1] * 6 + [0] * 12
+  >>> groups = [1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6]
+  >>> sgkf = StratifiedGroupKFold(n_splits=3)
+  >>> for train, test in sgkf.split(X, y, groups=groups):
+  ...     print("%s %s" % (train, test))
+  [ 0  2  3  4  5  6  7 10 11 15 16 17] [ 1  8  9 12 13 14]
+  [ 0  1  4  5  6  7  8  9 11 12 13 14] [ 2  3 10 15 16 17]
+  [ 1  2  3  8  9 10 12 13 14 15 16 17] [ 0  4  5  6  7 11]
+
+Implementation notes:
+
+- With the current implementation, a full shuffle is not possible in most
+  scenarios. When ``shuffle=True``, the following happens:
+
+  1. All groups are shuffled.
+  2. Groups are sorted by standard deviation of classes using stable sort.
+  3. Sorted groups are iterated over and assigned to folds.
+
+  That means that only groups with the same standard deviation of class
+  distribution will be shuffled, which might be useful when each group has only
+  a single class.
+- The algorithm greedily assigns each group to one of n_splits test sets,
+  choosing the test set that minimises the variance in class distribution
+  across test sets. Group assignment proceeds from groups with highest to
+  lowest variance in class frequency, i.e. large groups peaked on one or few
+  classes are assigned first.
+- This split is suboptimal in the sense that it might produce imbalanced splits
+  even if perfect stratification is possible. If the distribution of classes
+  is similar in every group, using :class:`GroupKFold` is better.
+
+Here is a visualization of cross-validation behavior for uneven groups:
+
 .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_005.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
@@ -733,7 +785,7 @@ Here is a usage example::
 
 Here is a visualization of the cross-validation behavior.
 
-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_008.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_011.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%
@@ -835,7 +887,7 @@ Example of 3-split time series cross-validation on a dataset with 6 samples::
 
 Here is a visualization of the cross-validation behavior.
 
-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_010.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_013.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
@@ -183,6 +183,16 @@ Changelog
   are integral.
   :pr:`9843` by :user:`Jon Crall <Erotemic>`.
 
+:mod:`sklearn.model_selection`
+..............................
+
+- |Feature| added :class:`model_selection.StratifiedGroupKFold`, that combines
+  :class:`model_selection.StratifiedKFold` and
+  :class:`model_selection.GroupKFold`, providing an ability to split data
+  preserving the distribution of classes in each split while keeping each
+  group within a single split. :pr:`18649` by
+  :user:`Leandro Hermida <hermidalc>` and :user:`Rodion Martynov <marrodion>`.
+
 :mod:`sklearn.naive_bayes`
 ..........................
 

diff --git a/examples/model_selection/plot_cv_indices.py b/examples/model_selection/plot_cv_indices.py
@@ -13,7 +13,8 @@
 
 from sklearn.model_selection import (TimeSeriesSplit, KFold, ShuffleSplit,
                                      StratifiedKFold, GroupShuffleSplit,
-                                     GroupKFold, StratifiedShuffleSplit)
+                                     GroupKFold, StratifiedShuffleSplit,
+                                     StratifiedGroupKFold)
 import numpy as np
 import matplotlib.pyplot as plt
 from matplotlib.patches import Patch
@@ -113,16 +114,32 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
 # %%
 # As you can see, by default the KFold cross-validation iterator does not
 # take either datapoint class or group into consideration. We can change this
-# by using the ``StratifiedKFold`` like so.
+# by using either:
+#
+# - ``StratifiedKFold`` to preserve the percentage of samples for each class.
+# - ``GroupKFold`` to ensure that the same group will not appear in two
+#   different folds.
+# - ``StratifiedGroupKFold`` to keep the constraint of ``GroupKFold`` while
+#   attempting to return stratified folds.
 
-fig, ax = plt.subplots()
-cv = StratifiedKFold(n_splits)
-plot_cv_indices(cv, X, y, groups, ax, n_splits)
+# To better demonstrate the difference, we will assign samples to groups
+# unevenly:
+
+uneven_groups = np.sort(np.random.randint(0, 10, n_points))
+
+cvs = [StratifiedKFold, GroupKFold, StratifiedGroupKFold]
+
+for cv in cvs:
+    fig, ax = plt.subplots(figsize=(6, 3))
+    plot_cv_indices(cv(n_splits), X, y, uneven_groups, ax, n_splits)
+    ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))],
+              ['Testing set', 'Training set'], loc=(1.02, .8))
+    # Make the legend fit
+    plt.tight_layout()
+    fig.subplots_adjust(right=.7)
 
 # %%
-# In this case, the cross-validation retained the same ratio of classes across
-# each CV split. Next we'll visualize this behavior for a number of CV
-# iterators.
+# Next we'll visualize this behavior for a number of CV iterators.
 #
 # Visualize cross-validation indices for many CV objects
 # ------------------------------------------------------
@@ -133,7 +150,7 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
 #
 # Note how some use the group/class information while others do not.
 
-cvs = [KFold, GroupKFold, ShuffleSplit, StratifiedKFold,
+cvs = [KFold, GroupKFold, ShuffleSplit, StratifiedKFold, StratifiedGroupKFold,
        GroupShuffleSplit, StratifiedShuffleSplit, TimeSeriesSplit]
 
 

diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py
@@ -14,6 +14,7 @@
 from ._split import ShuffleSplit
 from ._split import GroupShuffleSplit
 from ._split import StratifiedShuffleSplit
+from ._split import StratifiedGroupKFold
 from ._split import PredefinedSplit
 from ._split import train_test_split
 from ._split import check_cv
@@ -57,6 +58,7 @@
            'RandomizedSearchCV',
            'ShuffleSplit',
            'StratifiedKFold',
+           'StratifiedGroupKFold',
            'StratifiedShuffleSplit',
            'check_cv',
            'cross_val_predict',