Spaces:

nvidia
/

Plan2Align-NV

Sleeping

Plan2Align-NV / vecalign /metricx24 /evaluate.py

KuangDW

add alignment and specify encoder

dd05f29 25 days ago

5.02 kB

	# coding=utf-8
	# Copyright 2024 Google LLC
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""Evaluates the predictions from a MetricX model."""

	import dataclasses
	import json
	import os
	from typing import Any, Tuple

	from mt_metrics_eval import data
	from mt_metrics_eval import stats
	from mt_metrics_eval import tau_optimization
	import numpy as np
	import scipy.stats
	import transformers


	@dataclasses.dataclass
	class Arguments:
	    """Command-line arguments for evaluating MetricX predictions.

	    The field metadata carries the per-argument help text. `output_file` may
	    be omitted; it then defaults to an empty string, which callers can treat
	    as "do not write a metrics file".
	    """

	    # Name of the MTME dataset to evaluate against.
	    dataset: str = dataclasses.field(metadata={"help": "The MTME dataset."})

	    # Language pair within the dataset.
	    lp: str = dataclasses.field(metadata={"help": "The language pair."})

	    # Path of the file holding the predictions to evaluate.
	    input_file: str = dataclasses.field(metadata={"help": "The input file."})

	    # Path the evaluation metrics are written to. Optional: the empty default
	    # is falsy, so an `if args.output_file:` check skips writing.
	    output_file: str = dataclasses.field(
	        default="",
	        metadata={"help": "The output file with evaluation metrics."},
	    )


	def _convert_to_matrices(
	instances: list[dict[str, Any]]
	) -> Tuple[np.ndarray, np.ndarray]:
	"""Converts the instances to metric and human score matrices."""
	system_id_to_row = {}
	segment_id_to_col = {}

	for instance in instances:
	system_id = instance["system_id"]
	segment_id = instance["segment_id"]
	if system_id not in system_id_to_row:
	system_id_to_row[system_id] = len(system_id_to_row)
	if segment_id not in segment_id_to_col:
	segment_id_to_col[segment_id] = len(segment_id_to_col)

	num_rows = len(system_id_to_row)
	num_cols = len(segment_id_to_col)
	# MTME requires that missing scores must be None, not NaN.
	metric_scores = np.full((num_rows, num_cols), None, dtype=np.dtype(object))
	human_scores = np.full((num_rows, num_cols), None, dtype=np.dtype(object))

	for instance in instances:
	system_id = instance["system_id"]
	segment_id = instance["segment_id"]
	row = system_id_to_row[system_id]
	col = segment_id_to_col[segment_id]
	metric_scores[row, col] = (
	-1 * instance["prediction"]
	) # negate so higher is better
	human_scores[row, col] = instance["label"]

	return metric_scores, human_scores


	def main() -> None:
	    """Evaluates MetricX predictions against MQM ratings and reports metrics.

	    Parses `Arguments` from the command line, loads the MTME eval set for the
	    requested dataset and language pair, reads the JSON-lines predictions
	    from `input_file` (skipping excluded systems), and computes system-level
	    and segment-level agreement with the human scores. The metrics are
	    printed as indented JSON and, when `output_file` is non-empty, also
	    written there (creating the parent directory if needed).
	    """
	    parser = transformers.HfArgumentParser(Arguments)
	    (args,) = parser.parse_args_into_dataclasses()

	    # Download MTME data
	    data.Download()

	    # Load the data and filter outliers, the human system corresponding to the
	    # references, and any system that doesn't have any MQM scores.
	    evs = data.EvalSet(args.dataset, args.lp)
	    bad_systems = {evs.std_ref} | evs.outlier_sys_names
	    mqm = evs.Scores("seg", "mqm")
	    for system_id, scores in mqm.items():
	        if all(score is None for score in scores):
	            bad_systems.add(system_id)

	    # One JSON object per line; read as UTF-8 regardless of the locale default.
	    instances = []
	    with open(args.input_file, "r", encoding="utf-8") as f:
	        for line in f:
	            # Tolerate blank lines (e.g. a trailing newline at end of file).
	            if not line.strip():
	                continue
	            instance = json.loads(line)
	            if instance["system_id"] in bad_systems:
	                continue
	            instances.append(instance)

	    metric_seg_scores, human_seg_scores = _convert_to_matrices(instances)
	    # NOTE(review): a plain mean over the object array fails if any metric
	    # cell is None, so this presumably expects a prediction for every retained
	    # (system, segment) pair -- confirm against the prediction pipeline.
	    metric_sys_scores = np.mean(metric_seg_scores, axis=1)
	    # Human scores may be missing; average only the rated segments per system.
	    # `!= None` is intentional: it compares elementwise on the object array.
	    human_sys_scores = np.apply_along_axis(
	        lambda row: np.mean(row[row != None]), 1, human_seg_scores  # pylint: disable=singleton-comparison
	    )

	    # Segment-level correlations.
	    mask = human_seg_scores.reshape(-1) != None  # pylint: disable=singleton-comparison
	    seg_no_grouping_pearson, _ = scipy.stats.pearsonr(
	        metric_seg_scores.reshape(-1)[mask],
	        human_seg_scores.reshape(-1)[mask],
	    )
	    # Transposed so that each row holds one segment's scores across systems.
	    tie_calib_result = tau_optimization.tau_optimization(
	        metric_seg_scores.T,
	        human_seg_scores.T,
	        tau_optimization.TauSufficientStats.acc_23,
	    )

	    # System-level correlations.
	    sys_pearson, _ = scipy.stats.pearsonr(human_sys_scores, metric_sys_scores)
	    agree, num_pairs = stats.Agreement(human_sys_scores, metric_sys_scores)
	    sys_accuracy = agree / num_pairs
	    sys_spa = stats.PairwiseConfidenceError(
	        human_seg_scores.reshape(-1),
	        metric_seg_scores.reshape(-1),
	        human_seg_scores.shape[0],
	        filter_nones=True,
	    )[0]

	    metrics = {
	        "system_level": {
	            "pearson": sys_pearson,
	            "accuracy": sys_accuracy,
	            "spa": sys_spa,
	        },
	        "segment_level_no_grouping": {
	            "pearson": seg_no_grouping_pearson,
	        },
	        "segment_level_group_by_item": {
	            "accuracy": tie_calib_result.best_tau,
	            "epsilon": tie_calib_result.best_threshold,
	        },
	    }
	    print(json.dumps(metrics, indent=2))

	    if args.output_file:
	        dirname = os.path.dirname(args.output_file)
	        # A bare filename has no directory component; nothing to create then.
	        if dirname:
	            os.makedirs(dirname, exist_ok=True)
	        with open(args.output_file, "w", encoding="utf-8") as out:
	            out.write(json.dumps(metrics))


	# Run the evaluation only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()