Skip to content

MNT Use const memory views in DistanceMetric subclasses #19883

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/whats_new/v1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ Changelog
- |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are
deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_.

- |FIX| :class:`cluster.AgglomerativeClustering` now supports readonly
memory-mapped datasets. :pr:`19883` by `Julien Jerphanion <jjerphan>`.

:mod:`sklearn.compose`
......................

Expand Down Expand Up @@ -306,6 +309,9 @@ Changelog
:pr:`19473` by :user:`jiefangxuanyan <jiefangxuanyan>` and
:user:`Julien Jerphanion <jjerphan>`.

- |FIX| :class:`neighbors.DistanceMetric` subclasses now support readonly
memory-mapped datasets. :pr:`19883` by `Julien Jerphanion <jjerphan>`.

:mod:`sklearn.pipeline`
.......................

Expand Down
2 changes: 1 addition & 1 deletion sklearn/cluster/_hierarchical_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ def single_linkage_label(L):
@cython.boundscheck(False)
@cython.nonecheck(False)
def mst_linkage_core(
DTYPE_t [:, ::1] raw_data,
const DTYPE_t [:, ::1] raw_data,
DistanceMetric dist_metric):
"""
Compute the necessary elements of a minimum spanning
Expand Down
44 changes: 41 additions & 3 deletions sklearn/cluster/tests/test_hierarchical.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Authors: Vincent Michel, 2010, Gael Varoquaux 2012,
# Matteo Visconti di Oleggio Castello 2014
# License: BSD 3 clause
import itertools
from tempfile import mkdtemp
import shutil
import pytest
Expand All @@ -15,7 +16,11 @@
from scipy.cluster import hierarchy

from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.utils._testing import assert_almost_equal
from sklearn.neighbors.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
from sklearn.utils._testing import (
assert_almost_equal,
create_memmap_backed_data
)
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import ignore_warnings

Expand All @@ -28,8 +33,12 @@
from sklearn.metrics.pairwise import PAIRED_DISTANCES, cosine_distances,\
manhattan_distances, pairwise_distances
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster._hierarchical_fast import average_merge, max_merge
from sklearn.neighbors import kneighbors_graph, DistanceMetric
from sklearn.cluster._hierarchical_fast import (
average_merge,
max_merge,
mst_linkage_core
)
from sklearn.utils._fast_dict import IntFloatDict
from sklearn.utils._testing import assert_array_equal
from sklearn.datasets import make_moons, make_circles
Expand Down Expand Up @@ -264,6 +273,16 @@ def test_agglomerative_clustering():
assert_array_equal(clustering.labels_, clustering2.labels_)


def test_agglomerative_clustering_memory_mapped():
"""AgglomerativeClustering must work on mem-mapped dataset.

Non-regression test for issue #19875.
"""
rng = np.random.RandomState(0)
Xmm = create_memmap_backed_data(rng.randn(50, 100))
AgglomerativeClustering(affinity="euclidean", linkage="single").fit(Xmm)


def test_ward_agglomeration():
# Check that we obtain the correct solution in a simplistic case
rng = np.random.RandomState(0)
Expand Down Expand Up @@ -375,6 +394,25 @@ def test_vector_scikit_single_vs_scipy_single(seed):
assess_same_labelling(cut, cut_scipy)


@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS)
def test_mst_linkage_core_memory_mapped(metric):
"""The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset.

Non-regression test for issue #19875.
"""
rng = np.random.RandomState(seed=1)
X = rng.normal(size=(20, 4))
Xmm = create_memmap_backed_data(X)
argdict = METRICS_DEFAULT_PARAMS[metric]
keys = argdict.keys()
for vals in itertools.product(*argdict.values()):
kwargs = dict(zip(keys, vals))
distance_metric = DistanceMetric.get_metric(metric, **kwargs)
mst = mst_linkage_core(X, distance_metric)
mst_mm = mst_linkage_core(Xmm, distance_metric)
np.testing.assert_equal(mst, mst_mm)


def test_identical_points():
# Ensure identical points are handled correctly when using mst with
# a sparse connectivity matrix
Expand Down
10 changes: 5 additions & 5 deletions sklearn/neighbors/_dist_metrics.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ from ._typedefs import DTYPE, ITYPE
#
# We use these for the default (euclidean) case so that they can be
# inlined. This leads to faster computation for the most common case
cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2,
cdef inline DTYPE_t euclidean_dist(const DTYPE_t* x1, const DTYPE_t* x2,
ITYPE_t size) nogil except -1:
cdef DTYPE_t tmp, d=0
cdef np.intp_t j
Expand All @@ -25,7 +25,7 @@ cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2,
return sqrt(d)


cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2,
cdef inline DTYPE_t euclidean_rdist(const DTYPE_t* x1, const DTYPE_t* x2,
ITYPE_t size) nogil except -1:
cdef DTYPE_t tmp, d=0
cdef np.intp_t j
Expand All @@ -35,11 +35,11 @@ cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2,
return d


cdef inline DTYPE_t euclidean_dist_to_rdist(DTYPE_t dist) nogil except -1:
cdef inline DTYPE_t euclidean_dist_to_rdist(const DTYPE_t dist) nogil except -1:
return dist * dist


cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) nogil except -1:
cdef inline DTYPE_t euclidean_rdist_to_dist(const DTYPE_t dist) nogil except -1:
return sqrt(dist)


Expand All @@ -61,7 +61,7 @@ cdef class DistanceMetric:
cdef object func
cdef object kwargs

cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
ITYPE_t size) nogil except -1

cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
Expand Down
Loading