import streamlit as st
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

# ✅ Streamlit page config (must be the first Streamlit command in the script)
st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📝", layout="wide")

# ✅ Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ Load tokenizer from repo files
tokenizer = AutoTokenizer.from_pretrained(".")
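# (This assumes the tokenizer files, e.g. tokenizer_config.json and spm.model,
#  sit in the repo root. If they were missing, falling back to the base
#  checkpoint would presumably work, since the head below is trained on top of it:
#  tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small"))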

# ✅ Define ScoringModel with safe DeBERTa load
class ScoringModel(nn.Module):
    def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
        super().__init__()
        self.base = AutoModel.from_pretrained(
            base_model_name,
            torch_dtype=torch.float32,         # ensure weights are initialized
            low_cpu_mem_usage=False            # force full model load (avoid meta tensors)
        )
        self.base.gradient_checkpointing_enable()
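        # Note: gradient checkpointing only saves memory during training; under
        # torch.no_grad() at inference it has no effect, so this line could
        # likely be dropped in a serving-only app. Kept to match the training code.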
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.base.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
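        # Score from the [CLS] token, averaging three independently dropped-out
        # views of it (multi-sample dropout). In eval() mode dropout is a no-op,
        # so the three terms coincide at inference time.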
        hidden = self.base(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
        logits = (self.classifier(self.dropout1(hidden)) +
                  self.classifier(self.dropout2(hidden)) +
                  self.classifier(self.dropout3(hidden))) / 3
        return logits
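
# A hypothetical smoke test (not run by the app): the model maps a tokenized
# batch to one logit per example.
#   toks = tokenizer("Prompt: hi [SEP] hello", return_tensors="pt")
#   ScoringModel()(toks["input_ids"], toks["attention_mask"]).shape  # -> torch.Size([1, 1])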

# ✅ Load model and weights safely (cached so Streamlit reruns don't reload them)
@st.cache_resource
def load_model():
    scoring_model = ScoringModel()
    scoring_model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
    scoring_model.to(device)
    scoring_model.eval()
    return scoring_model

model = load_model()

# ✅ Streamlit UI setup (page config is set at the top of the script)
st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
st.markdown("---")

# ✅ Sidebar info
with st.sidebar:
    st.header("ℹ️ About")
    st.markdown("""
    This app evaluates *which AI response is better* given a prompt.

    - Enter a **prompt** and two **responses**
    - The model predicts **which one is higher quality**

    Powered by a fine-tuned **DeBERTa-v3-small** model 🚀
    """)

# ✅ Input form
col1, col2 = st.columns(2)

with col1:
    prompt = st.text_area("📝 Enter the Prompt", height=150)

with col2:
    st.markdown("<br>", unsafe_allow_html=True)
    st.markdown("👉 Provide two possible responses below:")

response_a = st.text_area("✍️ Response A", height=100)
response_b = st.text_area("✍️ Response B", height=100)

# ✅ Prediction
if st.button("🔍 Evaluate Responses"):
    if prompt and response_a and response_b:
        text_a = f"Prompt: {prompt} [SEP] {response_a}"
        text_b = f"Prompt: {prompt} [SEP] {response_b}"

        encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
        encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
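        # max_length=186 presumably mirrors the fine-tuning setup; longer
        # prompt+response pairs are truncated from the right.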

        encoded_a = {
            "input_ids": encoded_a["input_ids"].to(device),
            "attention_mask": encoded_a["attention_mask"].to(device)
        }
        encoded_b = {
            "input_ids": encoded_b["input_ids"].to(device),
            "attention_mask": encoded_b["attention_mask"].to(device)
        }
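        # Rebuilding these dicts also drops token_type_ids, which the DeBERTa
        # tokenizer may emit but which ScoringModel.forward() does not accept.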

        with torch.no_grad():
            score_a = model(**encoded_a).squeeze()
            score_b = model(**encoded_b).squeeze()

        prob_a = torch.sigmoid(score_a).item()
        prob_b = torch.sigmoid(score_b).item()
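
        # Note: the two sigmoids are scored independently, so they need not sum
        # to 1. A single head-to-head probability could instead be derived from
        # the raw logits, e.g.:
        #   p_b_wins = torch.softmax(torch.stack([score_a, score_b]), dim=0)[1].item()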

        st.subheader("🔮 Prediction Result")
        if prob_b > prob_a:
            st.success(f"✅ *Response B is better!* (Confidence: {prob_b:.4f})")
        else:
            st.success(f"✅ *Response A is better!* (Confidence: {prob_a:.4f})")

        mcol1, mcol2 = st.columns(2)
        mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
        mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")

        st.markdown("---")
        st.subheader("📊 Confidence Comparison")
        st.bar_chart({"Confidence": [prob_a, prob_b]})
    else:
        st.warning("⚠️ Please fill in *all fields* before evaluating!")