KuangDW
add alignment and specify encoder
dd05f29
# coding=utf-8
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluates the predictions from a MetricX model."""
import dataclasses
import json
import os
from typing import Any, Tuple
from mt_metrics_eval import data
from mt_metrics_eval import stats
from mt_metrics_eval import tau_optimization
import numpy as np
import scipy.stats
import transformers
@dataclasses.dataclass
class Arguments:
    """Command-line arguments for the evaluation script.

    Each field's `help` metadata is the text shown for the corresponding
    command-line flag.

    Attributes:
      dataset: The MTME dataset.
      lp: The language pair.
      input_file: The input file.
      output_file: The output file with evaluation metrics. Defaults to the
        empty string, in which case `main` only prints the metrics and writes
        no file.
    """

    dataset: str = dataclasses.field(metadata={"help": "The MTME dataset."})
    lp: str = dataclasses.field(metadata={"help": "The language pair."})
    input_file: str = dataclasses.field(metadata={"help": "The input file."})
    # `main` only writes the metrics when this is non-empty, so the flag is
    # optional rather than required.
    output_file: str = dataclasses.field(
        default="",
        metadata={"help": "The output file with evaluation metrics."},
    )
def _convert_to_matrices(
instances: list[dict[str, Any]]
) -> Tuple[np.ndarray, np.ndarray]:
"""Converts the instances to metric and human score matrices."""
system_id_to_row = {}
segment_id_to_col = {}
for instance in instances:
system_id = instance["system_id"]
segment_id = instance["segment_id"]
if system_id not in system_id_to_row:
system_id_to_row[system_id] = len(system_id_to_row)
if segment_id not in segment_id_to_col:
segment_id_to_col[segment_id] = len(segment_id_to_col)
num_rows = len(system_id_to_row)
num_cols = len(segment_id_to_col)
# MTME requires that missing scores must be None, not NaN.
metric_scores = np.full((num_rows, num_cols), None, dtype=np.dtype(object))
human_scores = np.full((num_rows, num_cols), None, dtype=np.dtype(object))
for instance in instances:
system_id = instance["system_id"]
segment_id = instance["segment_id"]
row = system_id_to_row[system_id]
col = segment_id_to_col[segment_id]
metric_scores[row, col] = (
-1 * instance["prediction"]
) # negate so higher is better
human_scores[row, col] = instance["label"]
return metric_scores, human_scores
def main() -> None:
    """Evaluates MetricX predictions against MQM scores and reports metrics.

    Parses the command-line flags into `Arguments`, downloads the MTME data,
    reads the JSON-lines predictions from `args.input_file`, and computes
    system-level and segment-level statistics between the (negated) metric
    predictions and the human labels. The metrics are printed to stdout as
    JSON and, when `args.output_file` is non-empty, also written to that
    path (its parent directory is created if needed).

    Each non-blank input line must be a JSON object with the keys
    "system_id", "segment_id", "prediction" and "label", as consumed by
    `_convert_to_matrices`.
    """
    parser = transformers.HfArgumentParser(Arguments)
    (args,) = parser.parse_args_into_dataclasses()

    # Download MTME data
    data.Download()

    # Load the data and filter outliers, the human system corresponding to the
    # references, and any system that doesn't have any MQM scores.
    evs = data.EvalSet(args.dataset, args.lp)
    bad_systems = {evs.std_ref} | evs.outlier_sys_names
    mqm = evs.Scores("seg", "mqm")
    for system_id, scores in mqm.items():
        if all(score is None for score in scores):
            bad_systems.add(system_id)

    instances = []
    # JSON text is UTF-8; do not rely on the platform's locale encoding.
    with open(args.input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            instance = json.loads(line)
            if instance["system_id"] in bad_systems:
                continue
            instances.append(instance)

    metric_seg_scores, human_seg_scores = _convert_to_matrices(instances)

    # System scores are per-row means; human means skip the None entries.
    # The `!= None` comparisons are element-wise on object arrays, so they
    # must not be rewritten as `is not None`.
    metric_sys_scores = np.mean(metric_seg_scores, axis=1)
    human_sys_scores = np.apply_along_axis(
        lambda row: np.mean(row[row != None]), 1, human_seg_scores  # pylint: disable=singleton-comparison
    )

    # Segment-level correlations.
    # Pearson is computed over all flattened cells that have a human score.
    mask = human_seg_scores.reshape(-1) != None  # pylint: disable=singleton-comparison
    seg_no_grouping_pearson, _ = scipy.stats.pearsonr(
        metric_seg_scores.reshape(-1)[mask],
        human_seg_scores.reshape(-1)[mask],
    )
    # The matrices are transposed before being handed to tau_optimization.
    tie_calib_result = tau_optimization.tau_optimization(
        metric_seg_scores.T,
        human_seg_scores.T,
        tau_optimization.TauSufficientStats.acc_23,
    )

    # System-level correlations.
    sys_pearson, _ = scipy.stats.pearsonr(human_sys_scores, metric_sys_scores)
    agree, num_pairs = stats.Agreement(human_sys_scores, metric_sys_scores)
    sys_accuracy = agree / num_pairs
    # Only the first element of the PairwiseConfidenceError result is kept.
    sys_spa = stats.PairwiseConfidenceError(
        human_seg_scores.reshape(-1),
        metric_seg_scores.reshape(-1),
        human_seg_scores.shape[0],
        filter_nones=True,
    )[0]

    metrics = {
        "system_level": {
            "pearson": sys_pearson,
            "accuracy": sys_accuracy,
            "spa": sys_spa,
        },
        "segment_level_no_grouping": {
            "pearson": seg_no_grouping_pearson,
        },
        "segment_level_group_by_item": {
            "accuracy": tie_calib_result.best_tau,
            "epsilon": tie_calib_result.best_threshold,
        },
    }

    print(json.dumps(metrics, indent=2))

    if args.output_file:
        dirname = os.path.dirname(args.output_file)
        # A bare filename has no directory component to create.
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        with open(args.output_file, "w", encoding="utf-8") as out:
            out.write(json.dumps(metrics))
# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()