flytoe committed on
Commit b6a1553 · verified · 1 Parent(s): fd0abbd

Update app.py

Files changed (1)
  1. app.py +37 -13
app.py CHANGED
@@ -1,21 +1,45 @@
  import torch
- from transformers import AutoTokenizer, AutoModel
+ from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
+ from datasets import load_dataset

- # Load model & tokenizer
+ # 1️⃣ Load model & tokenizer
  model_name = "allenai/scibert_scivocab_uncased"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModel.from_pretrained(model_name)
+ model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)  # e.g. for 3 categories

- # Explicitly run on the CPU
- device = torch.device("cpu")
- model.to(device)
+ # 2️⃣ Load the dataset (replace with your own dataset)
+ dataset = load_dataset("scientific_papers", "arxiv")  # Hugging Face Datasets

- # Example text for the test
- text = "This paper introduces a novel deep learning approach for cancer diagnosis."
+ # 3️⃣ Tokenize the texts
+ def tokenize_function(examples):
+     return tokenizer(examples["abstract"], padding="max_length", truncation=True)

- # Tokenization & model forward pass
- inputs = tokenizer(text, return_tensors="pt").to(device)
- outputs = model(**inputs)
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)

- print("Model loaded successfully!")
- print("Output shape:", outputs.last_hidden_state.shape)
+ # 4️⃣ Set the training parameters
+ training_args = TrainingArguments(
+     output_dir="./results",
+     evaluation_strategy="epoch",
+     save_strategy="epoch",
+     per_device_train_batch_size=8,
+     per_device_eval_batch_size=8,
+     num_train_epochs=3,
+     weight_decay=0.01,
+     logging_dir="./logs",
+ )
+
+ # 5️⃣ Start the training
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_datasets["train"],
+     eval_dataset=tokenized_datasets["validation"],
+ )
+
+ trainer.train()
+
+ # 6️⃣ Save the model after training
+ model.save_pretrained("./trained_model")
+ tokenizer.save_pretrained("./trained_model")
+
+ print("✅ Training finished! Model saved.")