# File size: 4,382 Bytes
# d81b5f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import streamlit as st
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


@st.cache_resource
def pipeline_getter():
    """Load everything the classifier needs and hand it back as one tuple.

    The function is wrapped in ``st.cache_resource``, so Streamlit keeps the
    returned objects and reuses them instead of reloading on every rerun.

    Returns:
        A ``(tokenizer, model, mapping)`` tuple:
        the ``distilbert-base-cased`` tokenizer, the
        ``KemmerEdition/my-distill-classifier`` sequence-classification model,
        and the squeezed contents of ``./categories.csv`` used to turn a class
        index into a category label.
    """
    # NOTE(review): assumes categories.csv holds a single column with one
    # label per model output index -- confirm against the data file.
    return (
        AutoTokenizer.from_pretrained('distilbert-base-cased'),
        AutoModelForSequenceClassification.from_pretrained('KemmerEdition/my-distill-classifier'),
        pd.read_csv('./categories.csv').values.squeeze(),
    )


# Module-level handles shared by the prediction function below; the call is
# served through st.cache_resource (see the decorator on pipeline_getter).
tokenizer, model, mapping = pipeline_getter()


def predict_article_categories_with_confidence(
    text_data,
    abstract_text=None,
    confidence_level=0.95,
    max_categories=9
):
    """Score a paper against every category and select the strongest ones.

    The title and optional abstract are tokenized as a text pair and passed
    through the module-level ``model``. Logits are turned into per-category
    scores with an element-wise sigmoid. Categories are then taken in
    descending score order until the running sum of their scores reaches
    ``confidence_level`` or ``max_categories`` categories have been taken,
    whichever happens first.

    Args:
        text_data: Paper title passed to the tokenizer as ``text``.
        abstract_text: Optional abstract passed as ``text_pair``.
        confidence_level: Threshold for the running sum of sorted scores.
        max_categories: Upper bound on the number of selected categories.
            Values below 1 still yield one category.

    Returns:
        dict with keys:
            ``probabilities``: flat array of sigmoid scores, one per output.
            ``predicted_categories``: labels of the selected categories,
                best first.
            ``confidence``: running score sum at the last selected category.
            ``top_category``: label of the highest-scoring category.
            ``used_categories``: number of selected categories.
    """
    tokenized_input = tokenizer(
        text=text_data,
        text_pair=abstract_text,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )

    # Inference only: no need to record an autograd graph for the forward pass.
    with torch.no_grad():
        logits = model(**tokenized_input).logits
        scores = torch.sigmoid(logits)
    # NOTE(review): flatten() merges all rows, so this presumably expects a
    # single input example -- confirm before passing batches.
    probs = scores.detach().numpy().flatten()

    sorted_indices = np.argsort(probs)[::-1]
    cumulative_probs = np.cumsum(probs[sorted_indices])

    # Select at least one category and never more than the model provides.
    limit = min(max(max_categories, 1), len(sorted_indices))

    # Default to the cap: if the threshold is never reached we still return
    # the top `limit` categories instead of an empty selection.
    used = limit
    for position, running_total in enumerate(cumulative_probs[:limit], start=1):
        if running_total >= confidence_level:
            used = position
            break
    selected_indices = sorted_indices[:used]

    return {
        'probabilities': probs,
        'predicted_categories': [mapping[idx] for idx in selected_indices],
        'confidence': cumulative_probs[used - 1],
        'top_category': mapping[sorted_indices[0]],
        'used_categories': len(selected_indices)
    }


# Inject the CSS classes (.header, .input-box, .result-box, .category-badge)
# referenced by the raw HTML snippets rendered further down the page.
st.markdown("""
<style>
    .header {
        font-size: 36px !important;
        color: #1f77b4;
        margin-bottom: 20px;
    }
    .input-box {
        background-color: #f0f2f6;
        padding: 20px;
        border-radius: 10px;
        margin-bottom: 20px;
    }
    .result-box {
        background-color: #e6f3ff;
        padding: 20px;
        border-radius: 10px;
        margin-top: 20px;
    }
    .category-badge {
        display: inline-block;
        background-color: #1f77b4;
        color: white;
        padding: 5px 10px;
        margin: 5px;
        border-radius: 15px;
        font-size: 14px;
    }
</style>
""", unsafe_allow_html=True)

# Page title, styled through the .header class defined above.
st.markdown('<div class="header">Classificator of Paper from arxiv</div>', unsafe_allow_html=True)

# Input area: title and abstract fields, then two sliders side by side that
# tune how many categories the classifier reports.
with st.container():
    # NOTE(review): each st.markdown call is rendered as its own element, so
    # this opening/closing div pair may not actually wrap the widgets -- confirm.
    st.markdown('<div class="input-box">', unsafe_allow_html=True)
    title_input = st.text_input(
        '**Here you can write title:**',
        placeholder="e.g. Quantum Machine Learning Approaches",
    )
    abstract_input = st.text_area(
        '**Here you can write summary from arxiv:**',
        placeholder="Paste the abstract here for more accurate categorization...",
        height=150,
    )
    st.markdown('</div>', unsafe_allow_html=True)

    confidence_column, limit_column = st.columns(2)
    # Percentage; the caller converts it to a fraction before use.
    confidence_level = confidence_column.slider(
        '**Confidence level (%)**', min_value=80, max_value=100, value=95
    )
    max_categories = limit_column.slider(
        '**Maximum categories**', min_value=1, max_value=10, value=3
    )

# Classify the paper when the button is pressed and render the outcome.
if st.button('**Press F (just press)**', type="primary"):
    # Strip surrounding whitespace first so a blank-only title counts as
    # missing and a blank-only abstract is not sent as a text pair.
    title_text = title_input.strip()
    abstract_text = abstract_input.strip()

    if title_text:
        with st.spinner('Analyzing paper content...'):
            result = predict_article_categories_with_confidence(
                title_text,
                abstract_text or None,
                # The slider reports a percentage; the predictor expects a fraction.
                confidence_level=confidence_level / 100,
                max_categories=max_categories
            )

        with st.container():
            st.markdown('<div class="result-box">', unsafe_allow_html=True)
            st.subheader("Categorization Results")

            st.markdown("**Most likely category:**")
            # The top category is the one with the highest score.
            top_probability = np.max(result["probabilities"])
            st.markdown(
                f'<div class="category-badge">{result["top_category"]} (p={top_probability:.3f})</div>',
                unsafe_allow_html=True
            )

            # Everything after the first entry is a secondary category.
            extra_categories = result["predicted_categories"][1:]
            if extra_categories:
                st.markdown("Additional categories:")
                for category in extra_categories:
                    st.markdown(f'<div class="category-badge">{category}</div>', unsafe_allow_html=True)

            st.markdown("---")
    else:
        st.warning("Please enter at least the paper title")