File size: 4,631 Bytes
7e44c6f
 
 
 
 
5d710b8
7e44c6f
 
5d710b8
c52794d
 
7e44c6f
 
 
 
 
 
 
 
 
 
5d710b8
 
 
 
 
 
 
 
7e44c6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import streamlit as st
import pandas as pd
import icc  # Importing ICC computation functions

# Configure the page before any other Streamlit call; the wide layout lets the
# filter column and the results column sit side by side.
# The emoji literals below were mojibake (UTF-8 bytes decoded with the wrong
# code page); an unrecognised page_icon string is not a valid emoji.
st.set_page_config(
    layout="wide",
    page_title="LLM's Scores Evaluation: ICC Computation",
    page_icon="📊",
)

# Page title and a short description of the study this tool supports.
st.title("📊 LLM's Scores Evaluation: ICC Computation")
st.markdown(
    "This app supports a scientific study on peer review among Large Language "
    "Models (LLMs) (https://arxiv.org/abs/2412.09385). Before computing "
    "inter-rater agreement (ICC), responses from multiple LLMs are collected "
    "on a forecasting task, and each model is then asked to evaluate all "
    "responses using predefined criteria. This tool allows researchers to "
    "upload those evaluation scores, filter the data, and analyze model "
    "agreement through ICC metrics and heatmaps."
)

# **Two Side-by-Side Containers** (1:2 width ratio)
# Left holds upload + filters, right holds ICC results + heatmaps.
container_left, container_right = st.columns([1, 2])

# **LEFT: File Upload & Selection Filters**


def _select_with_all(noun, options):
    """Render a 'Select All' checkbox plus a multiselect and return the choice.

    Args:
        noun: Plural label used in both widget captions (e.g. "Assessors").
        options: List of values offered in the multiselect.

    Returns:
        The values currently selected in the multiselect.
    """
    select_all = st.checkbox(f"Select All {noun}", value=True)
    # A ticked box pre-selects every option; an unticked box starts empty.
    return st.multiselect(
        f"Select {noun}", options, default=options if select_all else []
    )


with container_left:
    st.header("📂 Upload & Selection")

    # File uploader
    uploaded_file = st.file_uploader("Upload Your CSV", type=["csv"])
    st.markdown("""
    **File Requirements:**
    - The file must be in **CSV format**.
    - It should contain the following columns:
    - **assessor**: Identifier for the assessor (e.g., evaluator name or ID).
    - **respondent**: Identifier for the respondent (e.g., participant name or ID).
    - **criterion_X**: Columns starting with "criterion" representing evaluation criteria (e.g., criterion_1, criterion_2, etc.).""")

    # `df` stays empty unless a valid file is loaded and filtered below, so the
    # results column never runs on a rejected or unparsed upload.
    df = pd.DataFrame()

    if uploaded_file is not None:
        try:
            # Read every cell as a string first; conversion is left to
            # icc.preprocess_data.
            raw_df = pd.read_csv(uploaded_file, delimiter=",", dtype=str)
        except (
            pd.errors.EmptyDataError,
            pd.errors.ParserError,
            UnicodeDecodeError,
        ):
            # An unreadable file is reported like any other invalid upload
            # instead of crashing the app with a traceback.
            raw_df = None

        required_columns = ["assessor", "respondent"]
        if raw_df is None:
            criterion_columns = []
            has_required = False
        else:
            criterion_columns = [
                col for col in raw_df.columns if col.startswith("criterion")
            ]
            has_required = all(col in raw_df.columns for col in required_columns)

        if not has_required or not criterion_columns:
            st.error("❌ Invalid CSV format.")
        else:
            st.success("✅ CSV format is valid!")

            # Filters (rendered in the left column, below the uploader)
            st.subheader("🔍 Select Filters")

            # Missing cells are NaN (a float) even with dtype=str; drop them so
            # sorted() does not fail comparing float with str.
            all_assessors = sorted(raw_df["assessor"].dropna().unique())
            all_respondents = sorted(raw_df["respondent"].dropna().unique())

            selected_assessors = _select_with_all("Assessors", all_assessors)
            selected_respondents = _select_with_all("Respondents", all_respondents)
            selected_criteria = _select_with_all("Criteria", criterion_columns)

            # Filter data based on user selection
            df = icc.preprocess_data(
                raw_df, selected_assessors, selected_respondents, selected_criteria
            )

            if df.empty:
                st.error("⚠️ No data available with selected filters.")

# **RIGHT: Display ICC Results + Heatmaps**
with container_right:
    st.header("📊 ICC Results & Heatmaps")

    # The short-circuit matters: `df` is only inspected once a file has been
    # uploaded.
    if uploaded_file is not None and not df.empty:
        with st.spinner("⏳ Computing ICC... Please wait."):
            icc_results = icc.compute_icc(df)

        # A None result is reported to the user as too few respondents.
        if icc_results is None:
            st.warning("⚠️ Not enough respondents to compute ICC.")
        else:
            st.subheader("📈 Overall ICC Results")
            st.dataframe(icc_results, use_container_width=True)  # ICC table

        # **HEATMAPS: Display Below in 3 Columns**
        st.subheader("🔥 ICC Heatmaps (Assessor Agreement)")
        heatmaps_per_row = 3
        heatmap_cols = st.columns(heatmaps_per_row)

        # Per-assessor ICC matrices, then one heatmap image per ICC type.
        # NOTE(review): heatmap_files is iterated with .items(), so it is a
        # mapping of ICC type -> image; presumably file paths - confirm in
        # icc.generate_heatmaps.
        icc_matrix_types = icc.compute_assessor_icc(df)
        heatmap_files = icc.generate_heatmaps(icc_matrix_types)

        for i, (icc_type, heatmap_file) in enumerate(heatmap_files.items()):
            # Wrap around the columns so more than three heatmaps stack in
            # further rows instead of raising IndexError.
            heatmap_cols[i % heatmaps_per_row].image(
                heatmap_file,
                caption=f"ICC Heatmap ({icc_type})",
                use_container_width=True,
            )