Spaces:

Metric-AI
/

ArmBench-LLM

Running

App Files Files Community

Bagratuni committed on Mar 10

Commit

779cbde

1 Parent(s): 1b75b9d

commit

Browse files

Files changed (7) hide show

app.py +114 -107
data_handler.py +110 -0
logo.png +0 -0
mmlu_pro_hy_results.csv +0 -8
model_handler.py +80 -0
model_results.json +581 -0
unified_exam_results.csv +0 -10

app.py CHANGED Viewed

@@ -1,115 +1,122 @@
 import gradio as gr
 import pandas as pd
 import plotly.express as px
-def display_table(exam_type):
-    if exam_type == "Armenian Exams":
-        df = pd.read_csv('unified_exam_results.csv')
-        df = df.sort_values(by='Average', ascending=False)
-        cols = df.columns.tolist()
-        cols.insert(1, cols.pop(cols.index('Average')))
-        df = df[cols]
-        df.rename(columns={'Armenian language and literature': 'Armenian language\nand literature'}, inplace=True)
-        df = df.round(4)
-    elif exam_type == "MMLU-Pro-Hy":
-        df = pd.read_csv('mmlu_pro_hy_results.csv')
-        subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
-        df['Average'] = df[subject_cols].mean(axis=1)
-        df = df.sort_values(by='Average', ascending=False)
-        cols = df.columns.tolist()
-        cols.remove('Accuracy')
-        cols.insert(1, cols.pop(cols.index('Average')))
-        cols.append(cols.pop(cols.index('Other')))
-        df = df[cols]
-        df = df.round(4)
-    return df
-def create_bar_chart(exam_type, plot_column):
-    if exam_type == "Armenian Exams":
-        df = pd.read_csv('unified_exam_results.csv')
-        df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
-        x_col = plot_column
-        title = f'{plot_column}'
-        x_range_max = 20
-        def get_label(score):
-            if score < 8:
-                return "Fail"
-            elif 8 <= score <= 18:
-                return "Pass"
-            else:
-                return "Distinction"
-        df['Test Result'] = df[plot_column].apply(get_label)
-        color_discrete_map = {
-            "Fail": "#ff5f56",
-            "Pass": "#ffbd2e",
-            "Distinction": "#27c93f"
-        }
-        fig = px.bar(df,
-            x=x_col,
-            y='Model',
-            color=df['Test Result'],
-            color_discrete_map=color_discrete_map,
-            labels={x_col: 'Score', 'Model': 'Model'},
-            title=title,
-            orientation='h')
-        fig.update_layout(
-            xaxis=dict(range=[0, x_range_max]),
-            title=dict(text=title, font=dict(size=16)),
-            xaxis_title=dict(font=dict(size=12)),
-            yaxis_title=dict(font=dict(size=12)),
-            yaxis=dict(autorange="reversed"),
-            autosize=True
-        )
-        return fig
-    elif exam_type == "MMLU-Pro-Hy":
-        df = pd.read_csv('mmlu_pro_hy_results.csv')
-        subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
-        df['Average'] = df[subject_cols].mean(axis=1)
-        df = df.sort_values(by=plot_column, ascending=False).reset_index(drop=True)
-        df = df.drop(columns=['Accuracy'])
-        x_col = plot_column
-        title = f'{plot_column}'
-        x_range_max = 1.0
-        fig = px.bar(df,
-            x=x_col,
-            y='Model',
-            color=x_col,
-            color_continuous_scale='Viridis',
-            labels={x_col: 'Accuracy', 'Model': 'Model'},
-            title=title,
-            orientation='h',
-            range_color=[0,1])
-        fig.update_layout(
-            xaxis=dict(range=[0, x_range_max]),
-            title=dict(text=title, font=dict(size=16)),
-            xaxis_title=dict(font=dict(size=12)),
-            yaxis_title=dict(font=dict(size=12)),
-            yaxis=dict(autorange="reversed"),
-            autosize=True
-        )
-        return fig
-with gr.Blocks() as app:
-    with gr.Tabs():
-        with gr.TabItem("Armenian Unified Exams"):
-            gr.Markdown("# Armenian Unified Test Exams")
-            gr.HTML(f"""
-                <div style="font-size: 16px;">
                     This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
-                </div>
-            """)
-            table_output_armenian = gr.DataFrame(value=lambda: display_table("Armenian Exams"))
-            plot_column_dropdown = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
-            plot_output_armenian = gr.Plot(lambda column: create_bar_chart("Armenian Exams", column), inputs=plot_column_dropdown)
-        with gr.TabItem("MMLU-Pro-Hy"):
-            gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
-            gr.HTML(f"""
-                <div style="font-size: 16px;">
                     This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
-                </div>
-            """)
-            table_output_mmlu = gr.DataFrame(value=lambda: display_table("MMLU-Pro-Hy"))
-            subject_cols = ['Average','Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology','Other']
-            plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
-            plot_output_mmlu = gr.Plot(lambda column: create_bar_chart("MMLU-Pro-Hy", column), inputs=plot_column_dropdown_mmlu)
-app.launch(share=True, debug=True)

 import gradio as gr
 import pandas as pd
 import plotly.express as px
+from model_handler import ModelHandler
+from data_handler import unified_exam_result_table, mmlu_result_table, unified_exam_chart, mmlu_chart
+# Module-level state shared by main() and refresh_data().
+# All four start as None and are assigned when the data is first loaded.
+# Raw per-model score frames as returned by ModelHandler.get_arm_bench_data():
+global_unified_exam_df = None
+global_mmlu_df = None
+# Display-ready tables built by unified_exam_result_table() / mmlu_result_table();
+# the chart callbacks read these at call time:
+global_output_armenian = None
+global_output_mmlu = None
+def refresh_data():
+    """Reload the benchmark results and rebuild both tables and both charts.
+
+    Re-assigns the four module-level globals, so chart callbacks that read
+    them afterwards see the refreshed data.
+
+    Returns:
+        A 4-tuple ``(unified exam table, MMLU table, unified exam chart,
+        MMLU chart)``; both charts are drawn for the 'Average' column.
+    """
+    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
+    global_mmlu_df, global_unified_exam_df = ModelHandler().get_arm_bench_data()
+    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
+    global_output_mmlu = mmlu_result_table(global_mmlu_df)
+    armenian_chart = unified_exam_chart(global_output_armenian, 'Average')
+    mmlu_average_chart = mmlu_chart(global_output_mmlu, 'Average')
+    return global_output_armenian, global_output_mmlu, armenian_chart, mmlu_average_chart
+def main():
+    """Load the benchmark data, build the Gradio UI and launch the app.
+
+    Populates the module-level globals, then builds three tabs (Armenian
+    Unified Exams, MMLU-Pro-Hy, About) plus a "Refresh Data" button wired to
+    refresh_data(). Blocks in app.launch() until the server stops.
+    """
+    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
+    model_handler = ModelHandler()
+    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
+    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
+    global_output_mmlu = mmlu_result_table(global_mmlu_df)
+    with gr.Blocks() as app:
+        with gr.Tabs():
+            with gr.TabItem("Armenian Unified Exams"):
+                gr.Markdown("# Armenian Unified Test Exams")
+                gr.Markdown(
+                    """
                     This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
+                    """
+                )
+                table_output_armenian = gr.DataFrame(value=global_output_armenian)
+                plot_column_dropdown_unified_exam = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
+                # The lambda looks the global up on every call, so it uses whatever
+                # table refresh_data() assigned most recently.
+                plot_output_armenian = gr.Plot(lambda column: unified_exam_chart(global_output_armenian, column), inputs=plot_column_dropdown_unified_exam)
+            with gr.TabItem("MMLU-Pro-Hy"):
+                gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
+                gr.Markdown(
+                    """
                     This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
+                    """
+                )
+                table_output_mmlu = gr.DataFrame(value=global_output_mmlu)
+                subject_cols = ['Average','Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology','Other']
+                plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
+                plot_output_mmlu = gr.Plot(lambda column: mmlu_chart(global_output_mmlu, column), inputs=plot_column_dropdown_mmlu)
+            with gr.TabItem("About"):
+                gr.Markdown("# About the Benchmark")
+                # A blank line separates the two paragraphs; without it Markdown
+                # joins the bold line onto the preceding sentence.
+                gr.Markdown(
+                    """
+                    This benchmark evaluates Language Models on Armenian-specific tasks, including Armenian Unified Test Exams and a translated version of the MMLU-Pro benchmark (MMLU-Pro-Hy). It is designed to measure the models' understanding and generation capabilities in the Armenian language.
+
+                    **Creator Company:** Metric AI Research Lab, Yerevan, Armenia."""
+                )
+                gr.Image("logo.png", width=200, show_label=False, show_download_button=False, show_fullscreen_button=False, show_share_button=False)
+                # The blank line after the list keeps the closing paragraph from
+                # being rendered as a continuation of the last bullet.
+                gr.Markdown("""
+                    - [Website](https://metric.am/)
+                    - [Hugging Face](https://huggingface.co/Metric-AI)
+
+                    MMLU-Pro-Hy is a massive multi-task test in MCQA format, inspired by the original MMLU benchmark, adapted for the Armenian language. The Armenian Unified Exams benchmark allows for comparison with human-level knowledge.
+                    """
+                )
+                gr.Markdown("## Submission Guide")
+                # NOTE: the tag and file name quoted below must match what
+                # ModelHandler.get_arm_bench_data() searches for (`arm_llm`,
+                # `results.json`), otherwise submitted models are never found.
+                gr.Markdown(
+                    """
+                    To submit a model for evaluation, please follow these steps:
+                    1. **Evaluate your model**:
+                       - Follow the evaluation script provided here: [https://github.com/Anania-AI/Arm-LLM-Benchmark](https://github.com/Anania-AI/Arm-LLM-Benchmark)
+                    2. **Format your submission file**:
+                        - After evaluation, you will get a `results.json` file. Ensure the file follows this format:
+                        ```json
+                        {
+                            "mmlu_results": [
+                                {
+                                    "category": "category_name",
+                                    "score": score_value
+                                },
+                                ...
+                            ],
+                            "unified_exam_results": [
+                                {
+                                    "category": "category_name",
+                                    "score": score_value
+                                },
+                                ...
+                            ]
+                        }
+                        ```
+                    3. **Submit your model**:
+                        - Add the `arm_llm` tag and the `results.json` file to your model card.
+                        - Click on the "Refresh Data" button in this app, and you will see your model's results.
+                    """
+                )
+                gr.Markdown("## Contributing")
+                gr.Markdown(
+                    """
+                    You can contribute to this benchmark in several ways:
+                    - Providing API credits for evaluating API-based models.
+                    - Citing our work in your research and publications.
+                    - Contributing to the development of the benchmark itself.
+                    """
+                )
+        # The button sits outside the tabs; one click replaces both tables and
+        # both charts with the values returned by refresh_data().
+        refresh_button = gr.Button("Refresh Data")
+        refresh_button.click(
+            fn=refresh_data,
+            outputs=[table_output_armenian,
+                     table_output_mmlu,
+                     plot_output_armenian,
+                     plot_output_mmlu],
+        )
+    # NOTE(review): share=True/debug=True look like local-development settings;
+    # confirm they are wanted for the hosted Space.
+    app.launch(share=True, debug=True)
+# Start the app only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

data_handler.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import gradio as gr
+import pandas as pd
+import plotly.express as px
+from model_handler import ModelHandler
+def unified_exam_result_table(unified_exam_df):
+    """Build the ranked leaderboard table for the Armenian Unified Exams.
+
+    Works on a copy, so ``unified_exam_df`` is left untouched. The result has
+    an 'Average' column (row-wise mean of the numeric columns), is sorted by
+    it in descending order, starts with a 1-based 'Rank' column followed by
+    the original first column and 'Average', and is rounded to 4 decimals.
+    The 'Armenian language and literature' header is renamed to a two-line
+    variant containing a line break.
+    """
+    table = unified_exam_df.copy()
+    table["Average"] = table.select_dtypes(include=["number"]).mean(axis=1)
+    table = table.sort_values(by='Average', ascending=False).reset_index(drop=True)
+    table.insert(0, 'Rank', range(1, len(table) + 1))
+    # Move 'Average' from the end to the third position.
+    ordered = [name for name in table.columns if name != 'Average']
+    ordered.insert(2, 'Average')
+    table = table[ordered]
+    table = table.rename(columns={'Armenian language and literature': 'Armenian language\nand literature'})
+    return table.round(4)
+def mmlu_result_table(mmlu_df):
+    """Build the ranked leaderboard table for MMLU-Pro-Hy.
+
+    Works on a copy, so ``mmlu_df`` is left untouched. The result has an
+    'Average' column (row-wise mean of the numeric columns), is sorted by it
+    in descending order, starts with a 1-based 'Rank' column followed by the
+    original first column and 'Average', and is rounded to 4 decimals. The
+    'Other' category, when present, is moved to the last column.
+    """
+    df = mmlu_df.copy()
+    numeric_columns = df.select_dtypes(include=["number"])
+    df["Average"] = numeric_columns.mean(axis=1)
+    df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
+    df.insert(0, 'Rank', range(1, len(df) + 1))
+    cols = df.columns.tolist()
+    cols.insert(2, cols.pop(cols.index('Average')))
+    # 'Other' is absent when no model reports that category (or the frame is
+    # empty); only reorder it when it exists instead of raising ValueError.
+    if 'Other' in cols:
+        cols.append(cols.pop(cols.index('Other')))
+    df = df[cols]
+    df = df.round(4)
+    return df
+def unified_exam_chart(unified_exam_df, plot_column):
+    """Draw a horizontal bar chart of one unified-exam column, one bar per model.
+
+    Rows are ordered by ``plot_column`` descending, ties broken by 'Model'
+    ascending. Each bar is coloured by its result band. The x-axis is fixed
+    to 0-20 and the figure is 1400 px wide. The input frame is not modified.
+
+    Args:
+        unified_exam_df: frame with a 'Model' column and the score column.
+        plot_column: name of the column to plot.
+
+    Returns:
+        The Plotly figure.
+    """
+    def get_label(score):
+        # Below 8 fails, 8 through 18 inclusive passes, anything else is a distinction.
+        if score < 8:
+            return "Fail"
+        if score <= 18:
+            return "Pass"
+        return "Distinction"
+    # The results table stores this header with an embedded line break.
+    if plot_column == 'Armenian language and literature':
+        plot_column = 'Armenian language\nand literature'
+    chart_title = f'{plot_column}'
+    ranked = unified_exam_df.copy()
+    ranked = ranked.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
+    ranked['Test Result'] = ranked[plot_column].apply(get_label)
+    band_colors = {
+        "Fail": "#ff5f56",
+        "Pass": "#ffbd2e",
+        "Distinction": "#27c93f"
+    }
+    fig = px.bar(
+        ranked,
+        x=plot_column,
+        y='Model',
+        color=ranked['Test Result'],
+        color_discrete_map=band_colors,
+        labels={plot_column: 'Score', 'Model': 'Model'},
+        title=chart_title,
+        orientation='h'
+    )
+    axis_title_font = dict(font=dict(size=12))
+    fig.update_layout(
+        xaxis=dict(range=[0, 20]),
+        title=dict(text=chart_title, font=dict(size=16)),
+        xaxis_title=axis_title_font,
+        yaxis_title=dict(font=dict(size=12)),
+        # Reversed so the first (highest-scoring) row is drawn at the top.
+        yaxis=dict(autorange="reversed"),
+        width=1400
+    )
+    return fig
+def mmlu_chart(mmlu_df, plot_column):
+    """Draw a horizontal bar chart of one MMLU-Pro-Hy column, one bar per model.
+
+    'Average' is (re)computed as the row-wise mean of the fourteen subject
+    columns before sorting by ``plot_column`` in descending order. Bars are
+    coloured on a Viridis scale pinned to [0, 1]. The x-axis is fixed to
+    0-1.0 and the figure is 1400 px wide. The input frame is not modified.
+
+    Args:
+        mmlu_df: frame with a 'Model' column and all subject columns.
+        plot_column: name of the column to plot.
+
+    Returns:
+        The Plotly figure.
+    """
+    subjects = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
+    chart_title = f'{plot_column}'
+    ranked = mmlu_df.copy()
+    ranked['Average'] = ranked[subjects].mean(axis=1)
+    ranked = ranked.sort_values(by=plot_column, ascending=False).reset_index(drop=True)
+    fig = px.bar(
+        ranked,
+        x=plot_column,
+        y='Model',
+        color=plot_column,
+        color_continuous_scale='Viridis',
+        labels={plot_column: 'Accuracy', 'Model': 'Model'},
+        title=chart_title,
+        orientation='h',
+        range_color=[0,1]
+    )
+    axis_title_font = dict(font=dict(size=12))
+    fig.update_layout(
+        xaxis=dict(range=[0, 1.0]),
+        title=dict(text=chart_title, font=dict(size=16)),
+        xaxis_title=axis_title_font,
+        yaxis_title=dict(font=dict(size=12)),
+        # Reversed so the first (highest-scoring) row is drawn at the top.
+        yaxis=dict(autorange="reversed"),
+        width=1400
+    )
+    return fig

logo.png ADDED Viewed

mmlu_pro_hy_results.csv DELETED Viewed

@@ -1,8 +0,0 @@
-Model,Accuracy,Biology,Business,Chemistry,Computer Science,Economics,Engineering,Health,History,Law,Math,Other,Philosophy,Physics,Psychology
-gpt-4o,0.685,0.8667,0.7424,0.6842,0.6176,0.7887,0.5625,0.7794,0.5517,0.5393,0.7788,0.5974,0.5476,0.6881,0.7164
-claude-3-5-haiku-20241022,0.522,0.75,0.5758,0.5579,0.4412,0.6901,0.4125,0.5882,0.5172,0.2472,0.6018,0.3636,0.4048,0.5596,0.5672
-claude-3-5-sonnet-20241022,0.701,0.8667,0.803,0.7579,0.7059,0.7887,0.5625,0.6618,0.6552,0.4944,0.7788,0.6494,0.5476,0.7523,0.7164
-DeepSeek-V3,0.672,0.8167,0.8182,0.6947,0.7353,0.7887,0.5875,0.6471,0.4828,0.3596,0.8584,0.5455,0.5476,0.6881,0.7164
-gemini-1.5-flash,0.579,0.75,0.7121,0.6947,0.5,0.7183,0.4,0.5,0.4483,0.2584,0.8319,0.3506,0.3571,0.6514,0.6567
-gemini-2.0-flash,0.737,0.85,0.8182,0.7895,0.7353,0.8169,0.6,0.75,0.5517,0.5281,0.8673,0.6364,0.6429,0.7982,0.7612
-Meta-Llama-3.3-70B-Instruct,0.523,0.7333,0.5303,0.5895,0.3824,0.6338,0.4875,0.5735,0.4138,0.3146,0.6018,0.3377,0.4524,0.5321,0.6119

model_handler.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import json
+import os
+from typing import Any, Dict
+import pandas as pd
+from huggingface_hub import HfApi, hf_hub_download
+class ModelHandler:
+    """Collect benchmark results from a local JSON cache and from tagged Hub models.
+
+    The cache is a JSON list of ``{"model_name": ..., "results": {...}}``
+    entries. Models found on the Hub that are not yet cached are downloaded
+    and appended to it.
+    """
+    # Hub tag used to discover submitted models.
+    HUB_TAG = "arm_llm"
+    # Name of the results file expected at the root of each tagged repository.
+    RESULTS_FILENAME = "results.json"
+
+    def __init__(self, model_infos_path=None):
+        """Create the Hub client and load the cached results.
+
+        Args:
+            model_infos_path: path of the JSON cache. Defaults to
+                ``model_results.json`` in the same directory as this module.
+        """
+        if model_infos_path is None:
+            # Resolve relative to this file so the bundled cache is found
+            # regardless of the working directory or operating system.
+            module_dir = os.path.dirname(os.path.abspath(__file__))
+            model_infos_path = os.path.join(module_dir, "model_results.json")
+        self.api = HfApi()
+        self.model_infos_path = model_infos_path
+        self.model_infos = self._load_model_infos()
+
+    def _load_model_infos(self) -> list:
+        """Return the cached entries, or an empty list when there is no usable cache."""
+        if not os.path.exists(self.model_infos_path):
+            return []
+        with open(self.model_infos_path, encoding="utf-8") as f:
+            loaded = json.load(f)
+        # Callers append to the result, so anything that is not a list
+        # (for example an empty ``{}``) is treated as an empty cache.
+        return loaded if isinstance(loaded, list) else []
+
+    def _save_model_infos(self):
+        """Write the current entries back to the cache file."""
+        print("Saving model infos")
+        with open(self.model_infos_path, "w", encoding="utf-8") as f:
+            json.dump(self.model_infos, f, indent=4)
+
+    def get_arm_bench_data(self):
+        """Pull results of newly tagged Hub models and return the score tables.
+
+        Repositories already present in the cache are skipped. A repository
+        that cannot be listed, downloaded or parsed is reported with a printed
+        message and skipped, so one bad submission does not abort the refresh.
+        The cache file is rewritten only when at least one model was added.
+
+        Returns:
+            ``(mmlu_df, unified_exam_df)``: two DataFrames with a 'Model'
+            column plus one column per result category.
+        """
+        known_names = {entry["model_name"] for entry in self.model_infos}
+        added = False
+        for hub_model in self.api.list_models(filter=self.HUB_TAG):
+            repo_id = hub_model.modelId
+            if repo_id in known_names:
+                continue
+            try:
+                if self.RESULTS_FILENAME not in self.api.list_repo_files(repo_id):
+                    continue
+                result_path = hf_hub_download(repo_id, filename=self.RESULTS_FILENAME)
+                with open(result_path, encoding="utf-8") as f:
+                    results = json.load(f)
+            except Exception as e:
+                # Best effort: report the failure and carry on with the next repo.
+                print(f"Error loading {repo_id} - {e}")
+                continue
+            self.model_infos.append({
+                "model_name": repo_id,
+                "results": results
+            })
+            known_names.add(repo_id)
+            added = True
+        if added:
+            self._save_model_infos()
+        return self._build_frames()
+
+    def _build_frames(self):
+        """Turn the cached entries into the MMLU and unified-exam DataFrames."""
+        mmlu_data = []
+        unified_exam_data = []
+        for model in self.model_infos:
+            results = model.get("results", {})
+            for key, rows in (("mmlu_results", mmlu_data), ("unified_exam_results", unified_exam_data)):
+                scores = results.get(key, [])
+                # A model without results for a benchmark gets no row in that table.
+                if not scores:
+                    continue
+                row = {"Model": model["model_name"]}
+                for result in scores:
+                    row[result["category"]] = result["score"]
+                rows.append(row)
+        return pd.DataFrame(mmlu_data), pd.DataFrame(unified_exam_data)

model_results.json ADDED Viewed

	@@ -0,0 +1,581 @@

+[
+    {
+        "model_name": "claude-3-7-sonnet-20250219",
+        "results": {
+            "mmlu_results": [],
+            "unified_exam_results": [
+                {
+                    "category": "Armenian language and literature",
+                    "score": 10.5
+                },
+                {
+                    "category": "Armenian history",
+                    "score": 7.75
+                },
+                {
+                    "category": "Mathematics",
+                    "score": 15.0
+                }
+            ]
+        }
+    },
+    {
+        "model_name": "claude-3-5-sonnet-20241022",
+        "results": {
+            "mmlu_results": [
+                {
+                    "category": "Biology",
+                    "score": 0.8667
+                },
+                {
+                    "category": "Business",
+                    "score": 0.803
+                },
+                {
+                    "category": "Chemistry",
+                    "score": 0.7579
+                },
+                {
+                    "category": "Computer Science",
+                    "score": 0.7059
+                },
+                {
+                    "category": "Economics",
+                    "score": 0.7887
+                },
+                {
+                    "category": "Engineering",
+                    "score": 0.5625
+                },
+                {
+                    "category": "Health",
+                    "score": 0.6618
+                },
+                {
+                    "category": "History",
+                    "score": 0.6552
+                },
+                {
+                    "category": "Law",
+                    "score": 0.4944
+                },
+                {
+                    "category": "Math",
+                    "score": 0.7788
+                },
+                {
+                    "category": "Other",
+                    "score": 0.6494
+                },
+                {
+                    "category": "Philosophy",
+                    "score": 0.5476
+                },
+                {
+                    "category": "Physics",
+                    "score": 0.7523
+                },
+                {
+                    "category": "Psychology",
+                    "score": 0.7164
+                }
+            ],
+            "unified_exam_results": [
+                {
+                    "category": "Armenian language and literature",
+                    "score": 10.0
+                },
+                {
+                    "category": "Armenian history",
+                    "score": 9.25
+                },
+                {
+                    "category": "Mathematics",
+                    "score": 12.75
+                }
+            ]
+        }
+    },
+    {
+        "model_name": "gemini-2.0-flash",
+        "results": {
+            "mmlu_results": [
+                {
+                    "category": "Biology",
+                    "score": 0.85
+                },
+                {
+                    "category": "Business",
+                    "score": 0.8182
+                },
+                {
+                    "category": "Chemistry",
+                    "score": 0.7895
+                },
+                {
+                    "category": "Computer Science",
+                    "score": 0.7353
+                },
+                {
+                    "category": "Economics",
+                    "score": 0.8169
+                },
+                {
+                    "category": "Engineering",
+                    "score": 0.6
+                },
+                {
+                    "category": "Health",
+                    "score": 0.75
+                },
+                {
+                    "category": "History",
+                    "score": 0.5517
+                },
+                {
+                    "category": "Law",
+                    "score": 0.5281
+                },
+                {
+                    "category": "Math",
+                    "score": 0.8673
+                },
+                {
+                    "category": "Other",
+                    "score": 0.6364
+                },
+                {
+                    "category": "Philosophy",
+                    "score": 0.6429
+                },
+                {
+                    "category": "Physics",
+                    "score": 0.7982
+                },
+                {
+                    "category": "Psychology",
+                    "score": 0.7612
+                }
+            ],
+            "unified_exam_results": [
+                {
+                    "category": "Armenian language and literature",
+                    "score": 5.5
+                },
+                {
+                    "category": "Armenian history",
+                    "score": 6.75
+                },
+                {
+                    "category": "Mathematics",
+                    "score": 17.25
+                }
+            ]
+        }
+    },
+    {
+        "model_name": "gpt-4o",
+        "results": {
+            "mmlu_results": [
+                {
+                    "category": "Biology",
+                    "score": 0.8667
+                },
+                {
+                    "category": "Business",
+                    "score": 0.7424
+                },
+                {
+                    "category": "Chemistry",
+                    "score": 0.6842
+                },
+                {
+                    "category": "Computer Science",
+                    "score": 0.6176
+                },
+                {
+                    "category": "Economics",
+                    "score": 0.7887
+                },
+                {
+                    "category": "Engineering",
+                    "score": 0.5625
+                },
+                {
+                    "category": "Health",
+                    "score": 0.7794
+                },
+                {
+                    "category": "History",
+                    "score": 0.5517
+                },
+                {
+                    "category": "Law",
+                    "score": 0.5393
+                },
+                {
+                    "category": "Math",
+                    "score": 0.7788
+                },
+                {
+                    "category": "Other",
+                    "score": 0.5974
+                },
+                {
+                    "category": "Philosophy",
+                    "score": 0.5476
+                },
+                {
+                    "category": "Physics",
+                    "score": 0.6881
+                },
+                {
+                    "category": "Psychology",
+                    "score": 0.7164
+                }
+            ],
+            "unified_exam_results": [
+                {
+                    "category": "Armenian language and literature",
+                    "score": 6.75
+                },
+                {
+                    "category": "Armenian history",
+                    "score": 6.75
+                },
+                {
+                    "category": "Mathematics",
+                    "score": 13.25
+                }
+            ]
+        }
+    },
+    {
+        "model_name": "qwen-max-2025-01-25",
+        "results": {
+            "mmlu_results": [],
+            "unified_exam_results": [
+                {
+                    "category": "Armenian language and literature",
+                    "score": 7.25
+                },
+                {
+                    "category": "Armenian history",
+                    "score": 4.5
+                },
+                {
+                    "category": "Mathematics",
+                    "score": 14.25
+                }
+            ]
+        }
+    },
+    {
+        "model_name": "gemini-1.5-flash",
+        "results": {
+            "mmlu_results": [
+                {
+                    "category": "Biology",
+                    "score": 0.75
+                },
+                {
+                    "category": "Business",
+                    "score": 0.7121
+                },
+                {
+                    "category": "Chemistry",
+                    "score": 0.6947
+                },
+                {
+                    "category": "Computer Science",
+                    "score": 0.5
+                },
+                {
+                    "category": "Economics",
+                    "score": 0.7183
+                },
+                {
+                    "category": "Engineering",
+                    "score": 0.4
+                },
+                {
+                    "category": "Health",
+                    "score": 0.5
+                },
+                {
+                    "category": "History",
+                    "score": 0.4483
+                },
+                {
+                    "category": "Law",
+                    "score": 0.2584
+                },
+                {
+                    "category": "Math",
+                    "score": 0.8319
+                },
+                {
+                    "category": "Other",
+                    "score": 0.3506
+                },
+                {
+                    "category": "Philosophy",
+                    "score": 0.3571
+                },
+                {
+                    "category": "Physics",
+                    "score": 0.6514
+                },
+                {
+                    "category": "Psychology",
+                    "score": 0.6567
+                }
+            ],
+            "unified_exam_results": [
+                {
+                    "category": "Armenian language and literature",
+                    "score": 4.75
+                },
+                {
+                    "category": "Armenian history",
+                    "score": 3.75
+                },
+                {
+                    "category": "Mathematics",
+                    "score": 15.0
+                }
+            ]
+        }
+    },
+    {
+        "model_name": "DeepSeek-V3",
+        "results": {
+            "mmlu_results": [
+                {
+                    "category": "Biology",
+                    "score": 0.8167
+                },
+                {
+                    "category": "Business",
+                    "score": 0.8182
+                },
+                {
+                    "category": "Chemistry",
+                    "score": 0.6947
+                },
+                {
+                    "category": "Computer Science",
+                    "score": 0.7353
+                },
+                {
+                    "category": "Economics",
+                    "score": 0.7887
+                },
+                {
+                    "category": "Engineering",
+                    "score": 0.5875
+                },
+                {
+                    "category": "Health",
+                    "score": 0.6471
+                },
+                {
+                    "category": "History",
+                    "score": 0.4828
+                },
+                {
+                    "category": "Law",
+                    "score": 0.3596
+                },
+                {
+                    "category": "Math",
+                    "score": 0.8584
+                },
+                {
+                    "category": "Other",
+                    "score": 0.5455
+                },
+                {
+                    "category": "Philosophy",
+                    "score": 0.5476
+                },
+                {
+                    "category": "Physics",
+                    "score": 0.6881
+                },
+                {
+                    "category": "Psychology",
+                    "score": 0.7164
+                }
+            ],
+            "unified_exam_results": [
+                {
+                    "category": "Armenian language and literature",
+                    "score": 5.25
+                },
+                {
+                    "category": "Armenian history",
+                    "score": 5.0
+                },
+                {
+                    "category": "Mathematics",
+                    "score": 12.25
+                }
+            ]
+        }
+    },
+    {
+        "model_name": "Meta-Llama-3.3-70B-Instruct",
+        "results": {
+            "mmlu_results": [
+                {
+                    "category": "Biology",
+                    "score": 0.7333
+                },
+                {
+                    "category": "Business",
+                    "score": 0.5303
+                },
+                {
+                    "category": "Chemistry",
+                    "score": 0.5895
+                },
+                {
+                    "category": "Computer Science",
+                    "score": 0.3824
+                },
+                {
+                    "category": "Economics",
+                    "score": 0.6338
+                },
+                {
+                    "category": "Engineering",
+                    "score": 0.4875
+                },
+                {
+                    "category": "Health",
+                    "score": 0.5735
+                },
+                {
+                    "category": "History",
+                    "score": 0.4138
+                },
+                {
+                    "category": "Law",
+                    "score": 0.3146
+                },
+                {
+                    "category": "Math",
+                    "score": 0.6018
+                },
+                {
+                    "category": "Other",
+                    "score": 0.3377
+                },
+                {
+                    "category": "Philosophy",
+                    "score": 0.4524
+                },
+                {
+                    "category": "Physics",
+                    "score": 0.5321
+                },
+                {
+                    "category": "Psychology",
+                    "score": 0.6119
+                }
+            ],
+            "unified_exam_results": [
+                {
+                    "category": "Armenian language and literature",
+                    "score": 4.5
+                },
+                {
+                    "category": "Armenian history",
+                    "score": 5.25
+                },
+                {
+                    "category": "Mathematics",
+                    "score": 11.5
+                }
+            ]
+        }
+    },
+    {
+        "model_name": "claude-3-5-haiku-20241022",
+        "results": {
+            "mmlu_results": [
+                {
+                    "category": "Biology",
+                    "score": 0.75
+                },
+                {
+                    "category": "Business",
+                    "score": 0.5758
+                },
+                {
+                    "category": "Chemistry",
+                    "score": 0.5579
+                },
+                {
+                    "category": "Computer Science",
+                    "score": 0.4412
+                },
+                {
+                    "category": "Economics",
+                    "score": 0.6901
+                },
+                {
+                    "category": "Engineering",
+                    "score": 0.4125
+                },
+                {
+                    "category": "Health",
+                    "score": 0.5882
+                },
+                {
+                    "category": "History",
+                    "score": 0.5172
+                },
+                {
+                    "category": "Law",
+                    "score": 0.2472
+                },
+                {
+                    "category": "Math",
+                    "score": 0.6018
+                },
+                {
+                    "category": "Other",
+                    "score": 0.3636
+                },
+                {
+                    "category": "Philosophy",
+                    "score": 0.4048
+                },
+                {
+                    "category": "Physics",
+                    "score": 0.5596
+                },
+                {
+                    "category": "Psychology",
+                    "score": 0.5672
+                }
+            ],
+            "unified_exam_results": [
+                {
+                    "category": "Armenian language and literature",
+                    "score": 5.0
+                },
+                {
+                    "category": "Armenian history",
+                    "score": 3.75
+                },
+                {
+                    "category": "Mathematics",
+                    "score": 10.75
+                }
+            ]
+        }
+    }
+]

unified_exam_results.csv DELETED Viewed

@@ -1,10 +0,0 @@
-Model,Armenian language and literature,Armenian history,Mathematics,Average
-claude-3-7-sonnet-20250219,10.5,7.75,15.0,11.08
-claude-3-5-sonnet-20241022,10.0,9.25,12.75,10.67
-gemini-2.0-flash,5.5,6.75,17.25,9.83
-gpt-4o,6.75,6.75,13.25,8.92
-qwen-max-2025-01-25,7.25,4.5,14.25,8.67
-gemini-1.5-flash,4.75,3.75,15.0,7.83
-DeepSeek-V3,5.25,5.0,12.25,7.5
-Meta-Llama-3.3-70B-Instruct,4.5,5.25,11.5,7.08
-claude-3-5-haiku-20241022,5.0,3.75,10.75,6.5