import streamlit as st
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer from repo files
tokenizer = AutoTokenizer.from_pretrained(".")
# Define ScoringModel with safe DeBERTa load
class ScoringModel(nn.Module):
    def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
        super().__init__()
        self.base = AutoModel.from_pretrained(
            base_model_name,
            torch_dtype=torch.float32,   # ensure weights are initialized
            low_cpu_mem_usage=False      # force full model load (avoid meta tensors)
        )
        self.base.gradient_checkpointing_enable()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.base.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        hidden = self.base(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
        logits = (self.classifier(self.dropout1(hidden)) +
                  self.classifier(self.dropout2(hidden)) +
                  self.classifier(self.dropout3(hidden))) / 3
        return logits
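# Note on the head above: averaging the classifier over three dropout masks is the
# multi-sample dropout trick; it only matters during training. Under model.eval()
# every nn.Dropout is the identity, so the three terms coincide and the forward pass
# reduces to a single classifier call on the [CLS] hidden state. A minimal shape
# check (a sketch, not executed by this app; the input text is illustrative):
#
#   enc = tokenizer("Prompt: hi [SEP] hello", return_tensors="pt")
#   ScoringModel()(enc["input_ids"], enc["attention_mask"]).shape  # torch.Size([1, 1])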
# Load model and weights safely
model = ScoringModel()
model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
model.to(device)
model.eval()
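# Streamlit re-executes this script on every widget interaction, so the checkpoint
# above is reloaded on each rerun. A cached loader is a common alternative (a sketch,
# assuming Streamlit >= 1.18, where st.cache_resource is available):
#
#   @st.cache_resource
#   def load_scoring_model():
#       m = ScoringModel()
#       m.load_state_dict(torch.load("scoring_model.pt", map_location=device))
#       return m.to(device).eval()
#
#   model = load_scoring_model()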
# Streamlit UI setup
st.set_page_config(page_title="LLM Response Evaluator", layout="wide")
st.markdown("<h1 style='text-align: center;'>๐ง LLM Response Evaluator</h1>", unsafe_allow_html=True) | |
st.markdown("---") | |
# โ Sidebar info | |
with st.sidebar: | |
st.header("โน๏ธ About") | |
st.markdown(""" | |
This app evaluates *which AI response is better* given a prompt. | |
- Enter a **prompt** and two **responses** | |
- The model predicts **which one is higher quality** | |
Powered by a fine-tuned **DeBERTa-v3-small** model ๐ | |
""") | |
# Input form
col1, col2 = st.columns(2)
with col1:
    prompt = st.text_area("Enter the Prompt", height=150)
with col2:
    st.markdown("<br>", unsafe_allow_html=True)
    st.markdown("Provide two possible responses below:")
    response_a = st.text_area("Response A", height=100)
    response_b = st.text_area("Response B", height=100)
# Prediction
if st.button("Evaluate Responses"):
    if prompt and response_a and response_b:
        text_a = f"Prompt: {prompt} [SEP] {response_a}"
        text_b = f"Prompt: {prompt} [SEP] {response_b}"
        encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
        encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
        encoded_a = {
            "input_ids": encoded_a["input_ids"].to(device),
            "attention_mask": encoded_a["attention_mask"].to(device)
        }
        encoded_b = {
            "input_ids": encoded_b["input_ids"].to(device),
            "attention_mask": encoded_b["attention_mask"].to(device)
        }
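        # The two inputs could equally be tokenized and scored in one batched call
        # (a sketch under the same settings as above; BatchEncoding.to() moves every
        # tensor in the encoding at once, and the model returns one score per row):
        #
        #   batch = tokenizer([text_a, text_b], return_tensors='pt', padding='max_length',
        #                     truncation=True, max_length=186).to(device)
        #   with torch.no_grad():
        #       scores = model(batch["input_ids"], batch["attention_mask"]).squeeze(-1)  # shape [2]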
        with torch.no_grad():
            score_a = model(**encoded_a).squeeze()
            score_b = model(**encoded_b).squeeze()
            prob_a = torch.sigmoid(score_a).item()
            prob_b = torch.sigmoid(score_b).item()
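        # prob_a and prob_b come from independent sigmoids, so they are per-response
        # scores and need not sum to 1. A normalized head-to-head probability could be
        # derived instead (a sketch; score_a and score_b are the raw logits above):
        #
        #   p_a, p_b = torch.softmax(torch.stack([score_a, score_b]), dim=0).tolist()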
        st.subheader("Prediction Result")
        if prob_b > prob_a:
            st.success(f"*Response B is better!* (Confidence: {prob_b:.4f})")
        else:
            st.success(f"*Response A is better!* (Confidence: {prob_a:.4f})")

        mcol1, mcol2 = st.columns(2)
        mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
        mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")

        st.markdown("---")
        st.subheader("Confidence Comparison")
        st.bar_chart({"Confidence": [prob_a, prob_b]})
    else:
        st.warning("Please fill in *all fields* before evaluating!")