Spaces:

nickusan
/

NlpDev

Sleeping

App Files Files Community

Nikita Pogadaev committed on Apr 7

Commit

c2c8638

1 Parent(s): 6da6312

adding model runner, first commit

Browse files

Files changed (3) hide show

app.py +174 -0
model_info/label_to_theme.json +1 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,174 @@

+#!/usr/bin/python3
+import streamlit as st
+import json
+import numpy as np
+import torch
+from transformers import (
+    DebertaV2Config,
+    DebertaV2Model,
+    DebertaV2Tokenizer,
+)
# Hugging Face checkpoint identifier used for the tokenizer below and by
# DebertPaperClassifier when it loads its config and encoder.
model_name = "microsoft/deberta-v3-base"
# Built once at import time; load_model() returns this same instance.
tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
def preprocess_text(text, tokenizer, max_length=512):
    """Tokenize ``text`` into fixed-length PyTorch tensors.

    Args:
        text: Input passed straight to ``tokenizer``.
        tokenizer: Callable tokenizer accepting ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        max_length: Length the encoding is padded and truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the given settings.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(text, **encoding_options)
def classify_text(text, model, tokenizer, device, threshold=0.5):
    """Run the classifier on one text and return per-label probabilities.

    Args:
        text: Raw input string to classify.
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns a tensor of logits. It is switched to eval mode here.
        tokenizer: Tokenizer forwarded to ``preprocess_text``.
        device: Torch device the input tensors are moved to before the
            forward pass.
        threshold: Probability above which a label counts as predicted.

    Returns:
        Tuple ``(probs, predictions)`` of NumPy arrays: the sigmoid
        probabilities and their 0/1 thresholded counterparts.
    """
    inputs = preprocess_text(text, tokenizer)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask)
    # Tensor.numpy() is only valid for CPU tensors, so bring the result back
    # to the host first; this is a no-op when the model already runs on CPU.
    probs = torch.sigmoid(logits).cpu()
    predictions = (probs > threshold).int().numpy()
    return probs.numpy(), predictions
def get_themes(text, model, tokenizer, label_to_theme, device, limit=5):
    """Return the ``limit`` most probable themes for ``text``.

    Args:
        text: Raw input string to classify.
        model: Classifier forwarded to ``classify_text``.
        tokenizer: Tokenizer forwarded to ``classify_text``.
        label_to_theme: Mapping from the label index rendered as a string
            (e.g. ``"3"``) to the theme name.
        device: Torch device forwarded to ``classify_text``.
        limit: Maximum number of themes to return. Values below 1 yield an
            empty list.

    Returns:
        List of ``(theme, share)`` tuples ordered by ascending share, so the
        most probable theme comes last. ``share`` is the label's sigmoid
        probability divided by the sum of all probabilities.
    """
    limit = int(limit)
    if limit <= 0:
        # Slicing with ``[-0:]`` would select every label instead of none.
        return []
    probabilities, _ = classify_text(text, model, tokenizer, device)
    probabilities = probabilities / probabilities.sum()
    scores = probabilities[0]
    top_labels = scores.argsort()[-limit:]
    return [(label_to_theme[str(label)], scores[label]) for label in top_labels]
class DebertPaperClassifier(torch.nn.Module):
    """DeBERTa-v3 encoder followed by an MLP head for multi-label output.

    The encoder and its config are loaded from the module-level
    ``model_name`` checkpoint. The head maps the encoder's hidden size to
    512 units and then to ``num_labels`` logits.
    """

    def __init__(self, num_labels, device, dropout_rate=0.1, class_weights=None):
        """Build the encoder, the classification head and the loss.

        Args:
            num_labels: Number of output logits.
            device: Device that ``class_weights`` is moved to.
            dropout_rate: Dropout probability used in the head.
            class_weights: Optional tensor passed as ``weight`` to
                ``BCEWithLogitsLoss``.
        """
        super().__init__()
        self.config = DebertaV2Config.from_pretrained(model_name)
        self.deberta = DebertaV2Model.from_pretrained(model_name, config=self.config)
        self.classifier = torch.nn.Sequential(
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(self.config.hidden_size, 512),
            torch.nn.LayerNorm(512),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(512, num_labels)
        )
        self._init_weights()
        if class_weights is not None:
            self.loss_fct = torch.nn.BCEWithLogitsLoss(weight=class_weights.to(device))
        else:
            self.loss_fct = torch.nn.BCEWithLogitsLoss()

    def _init_weights(self):
        """Re-initialise the head's linear layers: N(0, 0.02) weights, zero bias."""
        for module in self.classifier.modules():
            if isinstance(module, torch.nn.Linear):
                module.weight.data.normal_(mean=0.0, std=0.02)
                if module.bias is not None:
                    module.bias.data.zero_()

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute logits, and the loss when ``labels`` is given.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.
            labels: Optional targets for ``BCEWithLogitsLoss``.

        Returns:
            ``logits`` when ``labels`` is ``None``, otherwise
            ``(loss, logits)``.
        """
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # The head consumes the hidden state of the first token only.
        cls_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_output)
        if labels is None:
            return logits
        return self.loss_fct(logits, labels), logits
class DebertPaperClassifierV5(torch.nn.Module):
    """DeBERTa-v3-base encoder followed by an MLP head for multi-label output.

    The head maps the encoder's hidden size to 512 units and then to
    ``num_labels`` logits. ``_init_weights`` is available but is not invoked
    by ``__init__``.
    """

    def __init__(self, device, num_labels=47, dropout_rate=0.1, class_weights=None):
        """Build the encoder, the classification head and the loss.

        Args:
            device: Device that ``class_weights`` is moved to.
            num_labels: Number of output logits.
            dropout_rate: Dropout probability used in the head.
            class_weights: Optional tensor passed as ``weight`` to
                ``BCEWithLogitsLoss``.
        """
        super().__init__()
        self.config = DebertaV2Config.from_pretrained("microsoft/deberta-v3-base")
        self.deberta = DebertaV2Model.from_pretrained("microsoft/deberta-v3-base", config=self.config)
        self.classifier = torch.nn.Sequential(
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(self.config.hidden_size, 512),
            torch.nn.LayerNorm(512),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(512, num_labels)
        )
        if class_weights is not None:
            self.loss_fct = torch.nn.BCEWithLogitsLoss(weight=class_weights.to(device))
        else:
            self.loss_fct = torch.nn.BCEWithLogitsLoss()

    def _init_weights(self):
        """Re-initialise the head's linear layers: N(0, 0.02) weights, zero bias."""
        for module in self.classifier.modules():
            if isinstance(module, torch.nn.Linear):
                module.weight.data.normal_(mean=0.0, std=0.02)
                if module.bias is not None:
                    module.bias.data.zero_()

    # A second, earlier definition of forward() with the same behaviour was
    # shadowed by this one and has been removed.
    def forward(self,
                input_ids,
                attention_mask,
                labels=None,
               ):
        """Compute logits, and the loss when ``labels`` is given.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.
            labels: Optional targets for ``BCEWithLogitsLoss``.

        Returns:
            ``logits`` when ``labels`` is ``None``, otherwise
            ``(loss, logits)``.
        """
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # The head consumes the hidden state of the first token only.
        cls_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_output)
        loss = None
        if labels is not None:
            loss = self.loss_fct(logits, labels)
        return (loss, logits) if loss is not None else logits
@st.cache_resource
def load_model():
    """Load the classifier, label map and device used for inference.

    Reads ``model_info/label_to_theme.json`` to size the output layer, builds
    a ``DebertPaperClassifierV5`` on CUDA when available (CPU otherwise) and
    restores its weights from ``model_info/deberta_v3.pth``.

    Returns:
        Tuple ``(model, tokenizer, label_to_theme, device)``; ``tokenizer``
        is the module-level tokenizer instance.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    with open('model_info/label_to_theme.json', 'r') as mapping_file:
        label_to_theme = json.load(mapping_file)

    classifier = DebertPaperClassifierV5(device=device, num_labels=len(label_to_theme))
    classifier = classifier.to(device)
    state_dict = torch.load("model_info/deberta_v3.pth", map_location=device)
    classifier.load_state_dict(state_dict)
    return classifier, tokenizer, label_to_theme, device
def kek():
    """Render the Streamlit page and classify the entered paper on demand.

    Shows a title, two header images, inputs for the paper title and
    abstract, and a numeric input for how many themes to list. When the
    CLASSIFY button is pressed with at least one text field filled in, the
    model is loaded via ``load_model`` and the themes returned by
    ``get_themes`` are written out with their shares.
    """
    st.title("arXiv Paper Classifier")
    st.markdown("""
    <style>
    .image-row {
        display: flex;
        flex-direction: row;
        gap: 10px;
    }
    </style>
    <div class="image-row">
    <img width=100px src='https://storage.yandexcloud.net/lms-vault/media/cache/c9/a7/c9a754ba1b2bb5b34e1f178d4ec26f24.jpg'>
    <img width=300px src='https://pic.rutubelist.ru/video/ba/b6/bab6ab515c15837e28eb6c99df192cae.jpg'>
    </div>
    """, unsafe_allow_html=True)
    st.write("write the title or abstract to classify topic theme")
    title = st.text_input("title")
    abstract = st.text_area("abstract")
    # Start at 5 and forbid values below 1: without bounds the widget starts
    # at 0, which is not a meaningful number of themes to request.
    lim = int(st.number_input("top ? themes", min_value=1, value=5, step=1))
    if not st.button("CLASSIFY"):
        return
    if not title and not abstract:
        st.warning("empty abstract!!!")
        return
    text = f"{title}\n\n{abstract}" if title and abstract else title or abstract
    model, tokenizer, label_to_theme, device = load_model()
    with st.spinner("classifying..."):
        themes = get_themes(text, model, tokenizer, label_to_theme, device, lim)
    # Number from the themes actually returned: there may be fewer than
    # ``lim`` when it exceeds the number of labels. The list is written in
    # the order received, counting down so the last entry gets rank 1.
    total = len(themes)
    st.success(f"top {total} results:")
    for position, (theme, share) in enumerate(themes):
        st.write(f"{total - position}. - {theme}: {share:.1%}")
# Render the page only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    kek()

model_info/label_to_theme.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"0": "cs.AI", "1": "physics.soc-ph", "2": "stat.ML", "3": "cs.CE", "4": "cs.DB", "5": "cs.CL", "6": "cs.NA", "7": "cs.CY", "8": "cs.GT", "9": "cs.SI", "10": "stat.AP", "11": "cs.DL", "12": "math.ST", "13": "nlin.AO", "14": "cs.LO", "15": "cs.MM", "16": "cond-mat.dis-nn", "17": "cs.DM", "18": "cs.CC", "19": "stat.CO", "20": "cs.DC", "21": "cs.IT", "22": "cs.DS", "23": "cs.SY", "24": "q-bio.QM", "25": "cs.PL", "26": "cs.RO", "27": "cs.NE", "28": "cs.CR", "29": "cs.MA", "30": "q-bio.NC", "31": "cs.LG", "32": "cs.GR", "33": "physics.data-an", "34": "quant-ph", "35": "cs.IR", "36": "math.NA", "37": "math.PR", "38": "stat.ME", "39": "cs.SE", "40": "math.OC", "41": "math.IT", "42": "cs.HC", "43": "stat.TH", "44": "cs.NI", "45": "cs.CV", "46": "cs.SD"}

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+streamlit
+torch
+transformers
+numpy
+sentencepiece