Spaces:

Metric-AI
/

ArmBench-LLM

Running

App Files Community

daniel7an committed on Mar 1

Commit

0d208c0

1 Parent(s): 4781b83

updates

Browse files

Files changed (3) hide show

app.py +60 -67
mmlu_pro_hy_results.csv +8 -5
unified_exam_results.csv +1 -1

app.py CHANGED Viewed

@@ -5,62 +5,52 @@ import plotly.express as px
 def display_table(exam_type):
     if exam_type == "Armenian Exams":
         df = pd.read_csv('unified_exam_results.csv')
-        df = df.sort_values(by='Average score', ascending=False)
         cols = df.columns.tolist()
-        cols.insert(1, cols.pop(cols.index('Average score')))
         df = df[cols]
     elif exam_type == "MMLU-Pro-Hy":
         df = pd.read_csv('mmlu_pro_hy_results.csv')
-        df = df.sort_values(by='Accuracy', ascending=False)
     return df
 def create_bar_chart(exam_type, plot_column):
     if exam_type == "Armenian Exams":
         df = pd.read_csv('unified_exam_results.csv')
-        df = df.sort_values(by='Average score', ascending=False)
         df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
         x_col = plot_column
-        title = f'{plot_column} per Model'
-        if plot_column == 'Average score':
-            range_max = 20
-            x_range_max = 20
-        else:
-            range_max = 20
-            x_range_max = 20
-            def get_label(score):
-                if score < 8:
-                    return "Fail"
-                elif 8 <= score <= 18:
-                    return "Pass"
-                else:
-                    return "Distinction"
-            df['Test Result'] = df[plot_column].apply(get_label)
-        if plot_column in ['Average score', 'Accuracy']:
-            fig = px.bar(df,
-                x=x_col,
-                y='Model',
-                color=x_col,
-                color_continuous_scale='tealrose_r',
-                labels={x_col: plot_column, 'Model': 'Model'},
-                title=title,
-                orientation='h',
-                range_color=[0, range_max])
-        else:
-            color_discrete_map = {
-                "Fail": "#d15d80",
-                "Pass": "#edd8be",
-                "Distinction": "#059492"
-            }
-            fig = px.bar(df,
-                x=x_col,
-                y='Model',
-                color=df['Test Result'],
-                color_discrete_map=color_discrete_map,
-                labels={x_col: plot_column, 'Model': 'Model'},
-                title=title,
-                orientation='h')
         fig.update_layout(
             xaxis=dict(range=[0, x_range_max]),
@@ -74,30 +64,23 @@ def create_bar_chart(exam_type, plot_column):
     elif exam_type == "MMLU-Pro-Hy":
         df = pd.read_csv('mmlu_pro_hy_results.csv')
-        df = df.sort_values(by='Accuracy', ascending=False)
-        x_col = 'Accuracy'
-        title = 'Accuracy per Model (MMLU-Pro-Hy)'
-        range_max = 1.0
         x_range_max = 1.0
-        if plot_column != 'Accuracy':
-            def get_label(accuracy):
-                if accuracy < 0.5:
-                    return "Low"
-                elif 0.5 <= accuracy <= 0.8:
-                    return "Medium"
-                else:
-                    return "High"
-            df['Test Result'] = df['Accuracy'].apply(get_label)
         fig = px.bar(df,
-                                x=x_col,
-                                y='Model',
-                                color=x_col,
-                                color_continuous_scale='tealrose_r',
-                                labels={x_col: plot_column, 'Model': 'Model'},
-                                title=title,
-                                orientation='h',
-                                range_color=[0, range_max])
         fig.update_layout(
             xaxis=dict(range=[0, x_range_max]),
@@ -112,11 +95,21 @@ def create_bar_chart(exam_type, plot_column):
 with gr.Blocks() as app:
     with gr.Tabs():
         with gr.TabItem("Armenian Unified Exams"):
             table_output_armenian = gr.DataFrame(value=lambda: display_table("Armenian Exams"))
-            plot_column_dropdown = gr.Dropdown(choices=['Average score', 'Armenian language exam score', 'Armenian history exam score', 'Mathematics exam score'], value='Average score', label='Select Column to Plot')
             plot_output_armenian = gr.Plot(lambda column: create_bar_chart("Armenian Exams", column), inputs=plot_column_dropdown)
         with gr.TabItem("MMLU-Pro-Hy"):
             table_output_mmlu = gr.DataFrame(value=lambda: display_table("MMLU-Pro-Hy"))
-            plot_output_mmlu = gr.Plot(lambda: create_bar_chart("MMLU-Pro-Hy", 'Accuracy'))
-app.launch(share=True)

 def display_table(exam_type):
     if exam_type == "Armenian Exams":
         df = pd.read_csv('unified_exam_results.csv')
+        df = df.sort_values(by='Average', ascending=False)
         cols = df.columns.tolist()
+        cols.insert(1, cols.pop(cols.index('Average')))
         df = df[cols]
     elif exam_type == "MMLU-Pro-Hy":
         df = pd.read_csv('mmlu_pro_hy_results.csv')
+        subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
+        df['Average'] = df[subject_cols].mean(axis=1)
+        df = df.sort_values(by='Average', ascending=False)
+        cols = df.columns.tolist()
+        cols.remove('Accuracy')
+        cols.insert(1, cols.pop(cols.index('Average')))
+        cols.append(cols.pop(cols.index('Other')))
+        df = df[cols]
     return df
 def create_bar_chart(exam_type, plot_column):
     if exam_type == "Armenian Exams":
         df = pd.read_csv('unified_exam_results.csv')
         df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
         x_col = plot_column
+        title = f'{plot_column}'
+        x_range_max = 20
+        def get_label(score):
+            if score < 8:
+                return "Fail"
+            elif 8 <= score <= 18:
+                return "Pass"
+            else:
+                return "Distinction"
+        df['Test Result'] = df[plot_column].apply(get_label)
+        color_discrete_map = {
+            "Fail": "#ff5f56",
+            "Pass": "#ffbd2e",
+            "Distinction": "#27c93f"
+        }
+        fig = px.bar(df,
+            x=x_col,
+            y='Model',
+            color=df['Test Result'],
+            color_discrete_map=color_discrete_map,
+            labels={x_col: 'Score', 'Model': 'Model'},
+            title=title,
+            orientation='h')
         fig.update_layout(
             xaxis=dict(range=[0, x_range_max]),
     elif exam_type == "MMLU-Pro-Hy":
         df = pd.read_csv('mmlu_pro_hy_results.csv')
+        subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology']
+        df['Average'] = df[subject_cols].mean(axis=1)
+        df = df.sort_values(by='Average', ascending=False)
+        df = df.drop(columns=['Accuracy'])
+        x_col = plot_column
+        title = f'{plot_column}'
         x_range_max = 1.0
         fig = px.bar(df,
+            x=x_col,
+            y='Model',
+            color=x_col,
+            color_continuous_scale='Viridis',
+            labels={x_col: 'Accuracy', 'Model': 'Model'},
+            title=title,
+            orientation='h',
+            range_color=[0,1])
         fig.update_layout(
             xaxis=dict(range=[0, x_range_max]),
 with gr.Blocks() as app:
     with gr.Tabs():
         with gr.TabItem("Armenian Unified Exams"):
+            gr.Markdown("# Armenian Unified Test Exams")
+            gr.Markdown("### This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.")
+            # gr.Markdown("### Այս աղյուսակը պարունակում է տարբեր լեզվական մոդելների արդյունքները հայերեն լեզվի և գրականության, հայոց պատմության և մաթեմատիկայի միասնական քնությունների թեսթերի համար։ Գնահատման համակարգը 20 բալանոց սանդղակ է, որտեղ 0-8-ը նշանակում է Անբավարար, 8-18-ը՝ Բավարար, իսկ 18-20-ը՝ Գերազանց:")
             table_output_armenian = gr.DataFrame(value=lambda: display_table("Armenian Exams"))
+            plot_column_dropdown = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
             plot_output_armenian = gr.Plot(lambda column: create_bar_chart("Armenian Exams", column), inputs=plot_column_dropdown)
         with gr.TabItem("MMLU-Pro-Hy"):
+            gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
+            gr.Markdown("### This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.")
+            # gr.Markdown("### Այս աղյուսակը պարունակում է տարբեր լեզվական մոդելների արդյունքները MMLU-Pro թեսթի համար, որը թարգմանվել է հայերեն: MMLU-Pro-ն իրենից ներկայացնում է : Միավորները ներկայացնում են ճշգրտությունը:")
             table_output_mmlu = gr.DataFrame(value=lambda: display_table("MMLU-Pro-Hy"))
+            subject_cols = ['Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Other', 'Philosophy', 'Physics', 'Psychology', 'Average']
+            plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
+            plot_output_mmlu = gr.Plot(lambda column: create_bar_chart("MMLU-Pro-Hy", column), inputs=plot_column_dropdown_mmlu)
+app.launch(share=True, debug=True)

mmlu_pro_hy_results.csv CHANGED Viewed

@@ -1,5 +1,8 @@
-Model,Accuracy
-claude-3-5-haiku-20241022,0.526
-claude-3-5-sonnet-20241022,0.701
-gemini-2.0-flash,0.741
-gemini-1.5-flash,0.586

+Model,Accuracy,Biology,Business,Chemistry,Computer Science,Economics,Engineering,Health,History,Law,Math,Other,Philosophy,Physics,Psychology
+gpt-4o,0.685,0.8667,0.7424,0.6842,0.6176,0.7887,0.5625,0.7794,0.5517,0.5393,0.7788,0.5974,0.5476,0.6881,0.7164
+claude-3-5-haiku-20241022,0.522,0.75,0.5758,0.5579,0.4412,0.6901,0.4125,0.5882,0.5172,0.2472,0.6018,0.3636,0.4048,0.5596,0.5672
+claude-3-5-sonnet-20241022,0.701,0.8667,0.803,0.7579,0.7059,0.7887,0.5625,0.6618,0.6552,0.4944,0.7788,0.6494,0.5476,0.7523,0.7164
+DeepSeek-V3,0.672,0.8167,0.8182,0.6947,0.7353,0.7887,0.5875,0.6471,0.4828,0.3596,0.8584,0.5455,0.5476,0.6881,0.7164
+gemini-1.5-flash,0.579,0.75,0.7121,0.6947,0.5,0.7183,0.4,0.5,0.4483,0.2584,0.8319,0.3506,0.3571,0.6514,0.6567
+gemini-2.0-flash,0.737,0.85,0.8182,0.7895,0.7353,0.8169,0.6,0.75,0.5517,0.5281,0.8673,0.6364,0.6429,0.7982,0.7612
+Meta-Llama-3.3-70B-Instruct,0.523,0.7333,0.5303,0.5895,0.3824,0.6338,0.4875,0.5735,0.4138,0.3146,0.6018,0.3377,0.4524,0.5321,0.6119

unified_exam_results.csv CHANGED Viewed

@@ -1,4 +1,4 @@
-Model,Armenian language exam score,Armenian history exam score,Mathematics exam score,Average score
 claude-3-7-sonnet-20250219,10.5,7.75,15.0,11.08
 claude-3-5-sonnet-20241022,10.0,9.25,12.75,10.67
 gemini-2.0-flash,5.5,6.75,17.25,9.83

+Model,Armenian language and literature,Armenian history,Mathematics,Average
 claude-3-7-sonnet-20250219,10.5,7.75,15.0,11.08
 claude-3-5-sonnet-20241022,10.0,9.25,12.75,10.67
 gemini-2.0-flash,5.5,6.75,17.25,9.83