# llmfinetune / app.py
import streamlit as st
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
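
# Requires: streamlit, torch, transformers (and, typically, sentencepiece for the
# DeBERTa-v3 tokenizer). Versions are not pinned here; this list is an assumption
# based only on the imports and the model used below.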
# ✅ Set up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ Load the tokenizer from the files in this repo
tokenizer = AutoTokenizer.from_pretrained(".")
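
# The "." path assumes the tokenizer files (e.g. tokenizer_config.json and the
# SentencePiece model) are committed alongside app.py in this Space repo.
# If they are not, loading the tokenizer from the base checkpoint should behave
# the same (assumption, not verified against the original training setup):
# tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")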

# ✅ Define ScoringModel with a safe DeBERTa load
class ScoringModel(nn.Module):
    def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
        super().__init__()
        self.base = AutoModel.from_pretrained(
            base_model_name,
            torch_dtype=torch.float32,   # ensure weights are initialized
            low_cpu_mem_usage=False      # force a full model load (avoid meta tensors)
        )
        self.base.gradient_checkpointing_enable()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.base.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        # Use the first ([CLS]) token's hidden state as the pooled sequence representation
        hidden = self.base(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
        # Average the classifier output over three dropout masks (multi-sample dropout)
        logits = (self.classifier(self.dropout1(hidden)) +
                  self.classifier(self.dropout2(hidden)) +
                  self.classifier(self.dropout3(hidden))) / 3
        return logits
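
# "scoring_model.pt" is expected to hold a plain state_dict for ScoringModel,
# presumably produced after fine-tuning with something along the lines of
# (assumption; the training script is not part of this file):
# torch.save(model.state_dict(), "scoring_model.pt")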

# ✅ Load model and weights safely
model = ScoringModel()
model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
model.to(device)
model.eval()
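
# Optional: Streamlit re-runs this whole script on every widget interaction, so the
# weights above are reloaded each time. A cached loader would avoid that; a minimal
# sketch (not part of the original app, assuming the same file layout):
#
# @st.cache_resource
# def load_model():
#     m = ScoringModel()
#     m.load_state_dict(torch.load("scoring_model.pt", map_location=device))
#     return m.to(device).eval()
# model = load_model()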

# ✅ Streamlit UI setup
st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📝", layout="wide")
st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
st.markdown("---")

# ✅ Sidebar info
with st.sidebar:
    st.header("ℹ️ About")
    st.markdown("""
This app evaluates *which AI response is better* for a given prompt.

- Enter a **prompt** and two **responses**
- The model predicts **which one is higher quality**

Powered by a fine-tuned **DeBERTa-v3-small** model 🚀
    """)

# ✅ Input form
col1, col2 = st.columns(2)
with col1:
    prompt = st.text_area("📝 Enter the Prompt", height=150)
with col2:
    st.markdown("<br>", unsafe_allow_html=True)
    st.markdown("👉 Provide two possible responses below:")
    response_a = st.text_area("✍️ Response A", height=100)
    response_b = st.text_area("✍️ Response B", height=100)

# ✅ Prediction
if st.button("🔍 Evaluate Responses"):
    if prompt and response_a and response_b:
        # Pair each response with the prompt, using [SEP] as a plain-text separator
        text_a = f"Prompt: {prompt} [SEP] {response_a}"
        text_b = f"Prompt: {prompt} [SEP] {response_b}"

        encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
        encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)

        encoded_a = {
            "input_ids": encoded_a["input_ids"].to(device),
            "attention_mask": encoded_a["attention_mask"].to(device)
        }
        encoded_b = {
            "input_ids": encoded_b["input_ids"].to(device),
            "attention_mask": encoded_b["attention_mask"].to(device)
        }

        with torch.no_grad():
            score_a = model(**encoded_a).squeeze()
            score_b = model(**encoded_b).squeeze()
            prob_a = torch.sigmoid(score_a).item()
            prob_b = torch.sigmoid(score_b).item()
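
        # Note: A and B are scored independently; torch.sigmoid maps each raw logit
        # to a 0-1 score, and the larger score is reported as the better response.
        # This is a pointwise comparison, not a joint softmax over the pair.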

        st.subheader("🔮 Prediction Result")
        if prob_b > prob_a:
            st.success(f"✅ *Response B is better!* (Confidence: {prob_b:.4f})")
        else:
            st.success(f"✅ *Response A is better!* (Confidence: {prob_a:.4f})")

        mcol1, mcol2 = st.columns(2)
        mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
        mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")

        st.markdown("---")
        st.subheader("📊 Confidence Comparison")
        st.bar_chart({"Confidence": [prob_a, prob_b]})
    else:
        st.warning("⚠️ Please fill in *all fields* before evaluating!")