# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Compute BLEU-4 and ROUGE-1/2/L for a JSON prediction file.

Each record in the input file must contain a "predict" field and a "label"
field; averaged scores are printed and saved to ``predictions_score.json``.
"""

import json
import logging
import time

import fire
from datasets import load_dataset


try:
    import jieba  # type: ignore
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu  # type: ignore
    from rouge_chinese import Rouge  # type: ignore

    # Silence jieba's startup logging and pay the dictionary-loading cost once,
    # up front, instead of inside the first mapped worker.
    jieba.setLogLevel(logging.CRITICAL)
    jieba.initialize()
except ImportError:
    print("Please install llamafactory with `pip install -e .[metrics]`.")
    raise


def compute_metrics(sample: dict) -> dict:
    """Score a single prediction/label pair.

    Args:
        sample: a dict with string fields ``"predict"`` and ``"label"``.

    Returns:
        A dict mapping ``"rouge-1"``, ``"rouge-2"``, ``"rouge-l"`` and
        ``"bleu-4"`` to percentage scores rounded to 4 decimal places.
    """
    hypothesis = list(jieba.cut(sample["predict"]))
    reference = list(jieba.cut(sample["label"]))

    # BLEU is computed at character level; method3 smoothing avoids zero
    # scores for short sequences with no 4-gram overlap.
    bleu_score = sentence_bleu(
        [list(sample["label"])],
        list(sample["predict"]),
        smoothing_function=SmoothingFunction().method3,
    )

    # rouge_chinese raises on empty/whitespace-only token sequences, so fall
    # back to zero F-scores in that case.
    if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0:
        result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
    else:
        rouge = Rouge()
        scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference))
        result = scores[0]

    # Keep only the F-measure of each ROUGE variant, expressed as a percentage.
    metric_result = {k: round(v["f"] * 100, 4) for k, v in result.items()}
    metric_result["bleu-4"] = round(bleu_score * 100, 4)
    return metric_result


def main(filename: str):
    """Evaluate a prediction file and write averaged metrics to disk.

    Args:
        filename: path to a JSON/JSONL file whose records contain
            ``"predict"`` and ``"label"`` string fields.
    """
    start_time = time.time()
    dataset = load_dataset("json", data_files=filename, split="train")
    dataset = dataset.map(compute_metrics, num_proc=8, remove_columns=dataset.column_names)
    score_dict = dataset.to_dict()

    average_score = {}
    # Tuple sort on unique metric names gives a deterministic output order.
    for task, scores in sorted(score_dict.items()):
        # Guard against an empty prediction file (ZeroDivisionError otherwise),
        # and compute the average once instead of twice.
        avg = sum(scores) / len(scores) if scores else 0.0
        print(f"{task}: {avg:.4f}")
        average_score[task] = avg

    with open("predictions_score.json", "w", encoding="utf-8") as f:
        json.dump(average_score, f, indent=4)

    print(f"\nDone in {time.time() - start_time:.3f}s.\nScore file saved to predictions_score.json")


if __name__ == "__main__":
    fire.Fire(main)