import streamlit as st
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
# ✅ Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ✅ Load tokenizer from repo files
tokenizer = AutoTokenizer.from_pretrained(".")
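# AutoTokenizer.from_pretrained(".") picks up the tokenizer files committed to this
# Space's repo root (e.g. tokenizer_config.json and the SentencePiece model); note
# that the DeBERTa-v3 tokenizer additionally requires the `sentencepiece` package.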
# ✅ Define ScoringModel with safe DeBERTa load
class ScoringModel(nn.Module):
    def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
        super().__init__()
        self.base = AutoModel.from_pretrained(
            base_model_name,
            torch_dtype=torch.float32,  # ensure weights are initialized
            low_cpu_mem_usage=False     # force full model load (avoid meta tensors)
        )
        self.base.gradient_checkpointing_enable()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.base.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        hidden = self.base(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
        logits = (self.classifier(self.dropout1(hidden)) +
                  self.classifier(self.dropout2(hidden)) +
                  self.classifier(self.dropout3(hidden))) / 3
        return logits
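# Note: the three dropout branches implement multi-sample dropout: during training the
# classifier head sees three different dropout masks and the resulting logits are
# averaged, which stabilizes the head. In eval() mode dropout is a no-op, so all three
# terms coincide. gradient_checkpointing_enable() likewise only matters during training
# and is inert under torch.no_grad() inference.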
# ✅ Load model and weights safely
model = ScoringModel()
model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
model.to(device)
model.eval()
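# Streamlit re-executes this script on every widget interaction, so the weights above
# are reloaded on each rerun. A minimal sketch of a cached loader, assuming a Streamlit
# version that provides st.cache_resource (>= 1.18):
#
# @st.cache_resource
# def load_model():
#     m = ScoringModel()
#     m.load_state_dict(torch.load("scoring_model.pt", map_location=device))
#     return m.to(device).eval()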
# ✅ Streamlit UI setup
st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="🧠", layout="wide")
st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
st.markdown("---")
# ✅ Sidebar info
with st.sidebar:
    st.header("ℹ️ About")
    st.markdown("""
    This app evaluates *which AI response is better* given a prompt.
    - Enter a **prompt** and two **responses**
    - The model predicts **which one is higher quality**

    Powered by a fine-tuned **DeBERTa-v3-small** model 🚀
    """)
# ✅ Input form
col1, col2 = st.columns(2)
with col1:
    prompt = st.text_area("📝 Enter the Prompt", height=150)
with col2:
    st.markdown("<br>", unsafe_allow_html=True)
    st.markdown("👇 Provide two possible responses below:")
    response_a = st.text_area("✏️ Response A", height=100)
    response_b = st.text_area("✏️ Response B", height=100)
# ✅ Prediction
if st.button("🚀 Evaluate Responses"):
    if prompt and response_a and response_b:
        text_a = f"Prompt: {prompt} [SEP] {response_a}"
        text_b = f"Prompt: {prompt} [SEP] {response_b}"
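        # Each (prompt, response) pair is flattened into a single sequence; max_length=186
        # presumably mirrors the fine-tuning setup, so longer inputs are truncated to match.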
        encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
        encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)

        encoded_a = {
            "input_ids": encoded_a["input_ids"].to(device),
            "attention_mask": encoded_a["attention_mask"].to(device)
        }
        encoded_b = {
            "input_ids": encoded_b["input_ids"].to(device),
            "attention_mask": encoded_b["attention_mask"].to(device)
        }
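        # Each forward pass yields a single logit; sigmoid maps it to a [0, 1] quality
        # score. The two scores are independent (not a softmax over the pair), so the
        # displayed "confidences" need not sum to 1.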
        with torch.no_grad():
            score_a = model(**encoded_a).squeeze()
            score_b = model(**encoded_b).squeeze()

        prob_a = torch.sigmoid(score_a).item()
        prob_b = torch.sigmoid(score_b).item()
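        # The two passes could also be batched into a single forward call, halving
        # per-click overhead. A rough sketch of that alternative:
        #
        # batch = tokenizer([text_a, text_b], return_tensors='pt',
        #                   padding='max_length', truncation=True, max_length=186)
        # with torch.no_grad():
        #     scores = model(batch["input_ids"].to(device),
        #                    batch["attention_mask"].to(device)).squeeze(-1)
        # prob_a, prob_b = torch.sigmoid(scores).tolist()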
        st.subheader("🔮 Prediction Result")
        if prob_b > prob_a:
            st.success(f"✅ *Response B is better!* (Confidence: {prob_b:.4f})")
        else:
            st.success(f"✅ *Response A is better!* (Confidence: {prob_a:.4f})")

        mcol1, mcol2 = st.columns(2)
        mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
        mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")

        st.markdown("---")
        st.subheader("📊 Confidence Comparison")
        st.bar_chart({"Confidence": [prob_a, prob_b]})
    else:
        st.warning("⚠️ Please fill in *all fields* before evaluating!")