# clustering_evaluator/clustering_evaluator.py
import datasets
import evaluate
from sklearn.metrics import (
adjusted_mutual_info_score,
adjusted_rand_score,
calinski_harabasz_score,
completeness_score,
davies_bouldin_score,
fowlkes_mallows_score,
homogeneity_score,
silhouette_score,
)
from sklearn.metrics.cluster import contingency_matrix, pair_confusion_matrix
_CITATION = """
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
"""
_DESCRIPTION = """\
This evaluator computes multiple clustering metrics to assess the quality of a clustering.
"""
_KWARGS_DESCRIPTION = """
Computes the quality of clustering results.
Args:
samples' vector representations
y: computed cluster labels
Returns:
silhouete_score (float): cohesiveness and separation between clusters
davies_bouldin_score (float): average similarity measure of each cluster with its most similar cluster
calinski_harabasz_score (float): ratio of the sum of between-cluster dispersion and of within-cluster dispersion
"""
@evaluate.utils.file_utils.add_start_docstrings(
_DESCRIPTION, _KWARGS_DESCRIPTION
)
class ClusteringEvaluator(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
module_type="metric",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"samples": datasets.Sequence(datasets.Value("float32")),
"predictions": datasets.Value("int64"),
}
),
)
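
    # Note: `truth_labels` is intentionally not declared in the features above; it can be
    # passed as an extra keyword argument to `compute()`, which forwards it to `_compute`.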
def _compute(self, samples, predictions, truth_labels=None):
unsupervised_metrics = [
silhouette_score,
davies_bouldin_score,
calinski_harabasz_score,
]
supervised_metrics = [
adjusted_rand_score,
adjusted_mutual_info_score,
homogeneity_score,
completeness_score,
fowlkes_mallows_score,
contingency_matrix,
pair_confusion_matrix,
]
results = {}
        # Unsupervised metrics are always computed from the sample vectors and the
        # predicted labels; scikit-learn requires at least two distinct cluster labels
        for fn in unsupervised_metrics:
            results[fn.__name__] = float(fn(samples, predictions))
        # Supervised metrics are computed only when reference labels are provided
        if truth_labels is not None:
            for fn in supervised_metrics:
                score = fn(truth_labels, predictions)
                # contingency_matrix and pair_confusion_matrix return matrices rather
                # than scalars, so the float() cast fails and the raw matrix is kept
                try:
                    score = float(score)
                except (AttributeError, TypeError):
                    pass
                results[fn.__name__] = score
return results
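
# Usage sketch (illustrative only; assumes this module is loaded from its local path
# with `evaluate.load`, and the toy vectors/labels below stand in for real data):
#
#     import evaluate
#
#     clustering_metric = evaluate.load("path/to/clustering_evaluator")
#     samples = [[0.1, 0.2], [0.15, 0.22], [0.9, 0.8], [0.88, 0.79]]
#     predictions = [0, 0, 1, 1]
#
#     # Unsupervised metrics only
#     results = clustering_metric.compute(samples=samples, predictions=predictions)
#
#     # Unsupervised + supervised metrics (truth_labels is forwarded to _compute)
#     results = clustering_metric.compute(
#         samples=samples, predictions=predictions, truth_labels=[0, 0, 1, 1]
#     )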