Skip to content

Commit

Permalink
Fix prediction data not honoring cluster_selection_epsilon
Browse files (browse the repository at this point in the history)
  • Loading branch information
n9Mtq4 committed Mar 22, 2023
1 parent e55f957 commit 5a4944a
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 6 deletions.
5 changes: 4 additions & 1 deletion hdbscan/_hdbscan_tree.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,9 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
stabilities : ndarray (n_clusters,)
The cluster coherence strengths of each cluster.
selected_clusters : ndarray (n_clusters,)
The ids of the selected clusters, sorted in ascending order.
"""
cdef list node_list
cdef np.ndarray cluster_tree
Expand Down Expand Up @@ -803,4 +806,4 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
probs = get_probabilities(tree, reverse_cluster_map, labels)
stabilities = get_stability_scores(labels, clusters, stability, max_lambda)

return (labels, probs, stabilities)
return (labels, probs, stabilities, np.array(sorted(clusters)))
3 changes: 2 additions & 1 deletion hdbscan/flat.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,8 @@ def HDBSCAN_flat(X, n_clusters=None,
new_clusterer.probabilities_,
new_clusterer.cluster_persistence_,
new_clusterer._condensed_tree,
new_clusterer._single_linkage_tree) = output
new_clusterer._single_linkage_tree,
new_clusterer._selected_clusters) = output

# PredictionData attached to HDBSCAN should also change.
# A function re_init is defined in this module to handle this.
Expand Down
8 changes: 6 additions & 2 deletions hdbscan/hdbscan_.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def _tree_to_labels(
"""
condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
stability_dict = compute_stability(condensed_tree)
labels, probabilities, stabilities = get_clusters(
labels, probabilities, stabilities, selected_clusters = get_clusters(
condensed_tree,
stability_dict,
cluster_selection_method,
Expand All @@ -72,7 +72,8 @@ def _tree_to_labels(
max_cluster_size,
)

return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree)
return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree,
selected_clusters)


def _hdbscan_generic(
Expand Down Expand Up @@ -1130,6 +1131,7 @@ def __init__(
self._outlier_scores = None
self._prediction_data = None
self._relative_validity = None
self._selected_clusters = None

def fit(self, X, y=None):
"""Perform HDBSCAN clustering from features or distance matrix.
Expand Down Expand Up @@ -1186,6 +1188,7 @@ def fit(self, X, y=None):
self.cluster_persistence_,
self._condensed_tree,
self._single_linkage_tree,
self._selected_clusters,
self._min_spanning_tree,
) = hdbscan(clean_data, **kwargs)

Expand Down Expand Up @@ -1248,6 +1251,7 @@ def generate_prediction_data(self):
self._prediction_data = PredictionData(
self._raw_data,
self.condensed_tree_,
self._selected_clusters,
min_samples,
tree_type=tree_type,
metric=self.metric,
Expand Down
3 changes: 1 addition & 2 deletions hdbscan/prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,14 @@ def _recurse_leaf_dfs(self, current_node):
return sum(
[recurse_leaf_dfs(self.cluster_tree, child) for child in children], [])

def __init__(self, data, condensed_tree, min_samples,
def __init__(self, data, condensed_tree, selected_clusters, min_samples,
tree_type='kdtree', metric='euclidean', **kwargs):
self.raw_data = data.astype(np.float64)
self.tree = self._tree_type_map[tree_type](self.raw_data,
metric=metric, **kwargs)
self.core_distances = self.tree.query(data, k=min_samples)[0][:, -1]
self.dist_metric = DistanceMetric.get_metric(metric, **kwargs)

selected_clusters = sorted(condensed_tree._select_clusters())
# raw_condensed_tree = condensed_tree.to_numpy()
raw_condensed_tree = condensed_tree._raw_tree

Expand Down

0 comments on commit 5a4944a

Please sign in to comment.