Spaces:
Running
Running
File size: 4,382 Bytes
d81b5f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import streamlit as st
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
@st.cache_resource
def pipeline_getter():
    """Load the tokenizer, the classifier and the category-name table.

    The function is decorated with ``st.cache_resource``, so the loaded
    objects are meant to be created once and reused rather than rebuilt on
    every call.

    Returns:
        A ``(tokenizer, model, mapping)`` tuple. ``mapping`` is the content of
        ``./categories.csv`` as a NumPy array with length-1 axes squeezed out;
        it is indexed by class position elsewhere in this file.
    """
    # NOTE(review): the tokenizer comes from the base checkpoint while the
    # weights come from the fine-tuned repo -- presumably the fine-tuned repo
    # ships no tokenizer files; confirm.
    tokenizer_checkpoint = 'distilbert-base-cased'
    classifier_checkpoint = 'KemmerEdition/my-distill-classifier'

    loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
    classifier = AutoModelForSequenceClassification.from_pretrained(classifier_checkpoint)

    # The path is relative to the current working directory.
    categories_frame = pd.read_csv('./categories.csv')
    category_names = categories_frame.values.squeeze()

    return loaded_tokenizer, classifier, category_names
# Build the shared tokenizer / model / category table at import time; the
# prediction function below reads these three names as module-level globals.
tokenizer, model, mapping = pipeline_getter()
def predict_article_categories_with_confidence(
    text_data,
    abstract_text=None,
    confidence_level=0.95,
    max_categories=9
):
    """Predict the most probable categories for a paper.

    The title (and, optionally, the abstract) is tokenized as a text pair and
    passed through the module-level ``model``. Logits are turned into
    per-label scores with a sigmoid, labels are ranked by score, and the
    best labels are kept until their cumulative score reaches
    ``confidence_level`` or ``max_categories`` labels have been taken,
    whichever happens first. If neither limit is reached before the labels run
    out, every label is returned.

    Args:
        text_data: Text passed to the tokenizer as ``text`` (the paper title).
        abstract_text: Optional text passed as ``text_pair`` (the abstract).
        confidence_level: Cumulative score at which selection stops.
        max_categories: Upper bound on the number of selected categories;
            values below 1 still yield one category.

    Returns:
        A dict with the keys:
            ``probabilities``: flat array of per-label sigmoid scores,
            ``predicted_categories``: selected category names, best first,
            ``confidence``: cumulative score of the selected labels,
            ``top_category``: name of the highest-scoring label,
            ``used_categories``: number of selected labels.
    """
    tokenized_input = tokenizer(
        text=text_data,
        text_pair=abstract_text,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )

    # Pure inference: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        logits = model(**tokenized_input).logits
        # NOTE(review): sigmoid scores are not normalised across labels, so
        # their cumulative sum can exceed 1 -- confirm the model is multi-label.
        probs = torch.sigmoid(logits).numpy().flatten()

    # Rank labels from highest to lowest score.
    sorted_indices = np.argsort(probs)[::-1]
    cumulative_probs = np.cumsum(probs[sorted_indices])

    # Count how many top labels to keep. When the loop runs to completion
    # without hitting either limit, ``n_selected`` ends up equal to the number
    # of labels, so the selection is never left empty for non-empty output.
    n_selected = 0
    for n_selected, cum_prob in enumerate(cumulative_probs, start=1):
        if cum_prob >= confidence_level or n_selected >= max_categories:
            break
    selected_indices = sorted_indices[:n_selected]

    return {
        'probabilities': probs,
        'predicted_categories': [mapping[idx] for idx in selected_indices],
        'confidence': cumulative_probs[n_selected - 1],
        'top_category': mapping[sorted_indices[0]],
        'used_categories': len(selected_indices)
    }
# CSS classes referenced by the raw-HTML snippets rendered on this page
# (header, input-box, result-box, category-badge).
_PAGE_CSS = """
<style>
.header {
font-size: 36px !important;
color: #1f77b4;
margin-bottom: 20px;
}
.input-box {
background-color: #f0f2f6;
padding: 20px;
border-radius: 10px;
margin-bottom: 20px;
}
.result-box {
background-color: #e6f3ff;
padding: 20px;
border-radius: 10px;
margin-top: 20px;
}
.category-badge {
display: inline-block;
background-color: #1f77b4;
color: white;
padding: 5px 10px;
margin: 5px;
border-radius: 15px;
font-size: 14px;
}
</style>
"""
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

# Page title, rendered as raw HTML so that it picks up the ``header`` class.
_PAGE_HEADER_HTML = '<div class="header">Classificator of Paper from arxiv</div>'
st.markdown(_PAGE_HEADER_HTML, unsafe_allow_html=True)
# Title and abstract inputs; opening and closing ``input-box`` div markup is
# emitted before and after the two widgets.
with st.container():
    st.markdown('<div class="input-box">', unsafe_allow_html=True)
    title_input = st.text_input(
        '**Here you can write title:**',
        placeholder="e.g. Quantum Machine Learning Approaches",
    )
    abstract_input = st.text_area(
        '**Here you can write summary from arxiv:**',
        placeholder="Paste the abstract here for more accurate categorization...",
        height=150,
    )
    st.markdown('</div>', unsafe_allow_html=True)

# Two side-by-side sliders: the confidence threshold in percent (80-100,
# default 95) and the cap on the number of categories (1-10, default 3).
col1, col2 = st.columns(2)
confidence_level = col1.slider('**Confidence level (%)**', 80, 100, 95)
max_categories = col2.slider('**Maximum categories**', 1, 10, 3)
# Run the classifier when the button is pressed and render the outcome.
if st.button('**Press F (just press)**', type="primary"):
    # Treat whitespace-only input as missing so that a blank title is rejected
    # and a blank abstract is not sent to the tokenizer as a text pair.
    title = title_input.strip()
    abstract = abstract_input.strip()

    if title:
        with st.spinner('Analyzing paper content...'):
            result = predict_article_categories_with_confidence(
                title,
                abstract or None,
                # The slider works in percent; the predictor expects a fraction.
                confidence_level=confidence_level / 100,
                max_categories=max_categories
            )

        with st.container():
            st.markdown('<div class="result-box">', unsafe_allow_html=True)
            st.subheader("Categorization Results")

            st.markdown("**Most likely category:**")
            top_probability = result["probabilities"].max()
            st.markdown(
                f'<div class="category-badge">{result["top_category"]} (p={top_probability:.3f})</div>',
                unsafe_allow_html=True)

            # The first predicted category is the top one shown above; list
            # only the remaining ones here.
            additional_categories = result["predicted_categories"][1:]
            if additional_categories:
                st.markdown("Additional categories:")
                for category in additional_categories:
                    st.markdown(f'<div class="category-badge">{category}</div>', unsafe_allow_html=True)

            st.markdown("---")
    else:
        st.warning("Please enter at least the paper title")