from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import gradio as gr
import torch

# Step 1: Load and inspect the dataset
# If a "KeyError: 'text'" occurs, check the column names

dataset = load_dataset("armanc/scientific_papers", "arxiv", trust_remote_code=True)  # if you use PubMed, replace "arxiv" with "pubmed"
print(dataset)
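
# Optional quick smoke test: the arxiv config holds roughly 200k training
# examples, so a full run is slow. SMOKE_TEST is a flag introduced here for
# illustration; set it to True to train on a small slice as a sanity check.
SMOKE_TEST = False
if SMOKE_TEST:
    dataset["train"] = dataset["train"].select(range(1000))
    dataset["validation"] = dataset["validation"].select(range(200))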

# Step 2: Prepare the tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

def tokenize_function(examples):
    # The dataset has no "text" column; abstracts live in "abstract".
    # max_length=151 keeps sequences short; raise it (up to SciBERT's
    # 512-token limit) if you want less truncation and have the memory.
    return tokenizer(examples["abstract"], padding="max_length", truncation=True, max_length=151)

dataset = dataset.map(tokenize_function, batched=True)

# Step 3: Load the model
model = AutoModelForSequenceClassification.from_pretrained("allenai/scibert_scivocab_uncased", num_labels=3)

# Adjust the training data: add a labels column
def add_labels(example):
    example["labels"] = 1  # Dummy-Label, falls nicht vorhanden (1=positiv, 0=negativ, 2=neutral o.Ä.)
    return example

dataset = dataset.map(add_labels)
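
# NOTE: with a constant dummy label the classifier cannot learn anything
# meaningful. A hypothetical sketch for real labels, assuming an `annotations`
# dict (not part of this script) that maps abstracts to class ids:
#
#     def add_labels(example):
#         example["labels"] = annotations[example["abstract"]]
#         return example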

# Step 4: Set the training parameters
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
)
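
# Optional: on a CUDA GPU, adding fp16=True to the arguments above enables
# mixed-precision training, which usually cuts memory use and training time.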

# Step 5: Create the Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"]
)
trainer.train()
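
# Optional: report validation metrics (eval loss etc.) once training is done
eval_metrics = trainer.evaluate()
print(eval_metrics)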

# Step 6: Save the model
trainer.save_model("./trained_model")
tokenizer.save_pretrained("./trained_model")
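
# The saved directory can be reloaded later, e.g. in a separate inference
# script (a minimal sketch; the path matches the save calls above):
#
#     model = AutoModelForSequenceClassification.from_pretrained("./trained_model")
#     tokenizer = AutoTokenizer.from_pretrained("./trained_model")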

# Step 7: Serve the model with Gradio
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=151)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # match the model's device (CPU or GPU)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    return {f"Label {i}": float(probabilities[0][i]) for i in range(len(probabilities[0]))}

iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=5, placeholder="Paste an abstract here..."),
    outputs=gr.Label(),
    title="Scientific Paper Evaluator",
    description="This AI model scores scientific papers based on relevance, uniqueness, and redundancy."
)

iface.launch()