rviana committed on
Commit d8dbb05 · 1 Parent(s): 2e48c0b

Update sentiment analysis app

Files changed (1): app.py +30 -2
app.py CHANGED
@@ -1,19 +1,47 @@
 import gradio as gr
 from datasets import load_dataset
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
 import torch
 
 # Check if GPU is available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 # Load the IMDb dataset
-dataset = load_dataset('imdb', split='test[:1%]')  # Load a small portion for testing
+dataset = load_dataset('imdb')
 
 # Initialize the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
 model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
 model.to(device)
 
+# Tokenize the dataset
+def tokenize_function(examples):
+    return tokenizer(examples['text'], padding="max_length", truncation=True)
+
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+# Set up training arguments
+training_args = TrainingArguments(
+    output_dir="./results",
+    evaluation_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=1,  # Start with fewer epochs for quicker runs
+    weight_decay=0.01,
+)
+
+# Initialize the Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(1000)),  # Use a subset for quicker runs
+    eval_dataset=tokenized_datasets["test"].shuffle(seed=42).select(range(1000)),
+)
+
+# Train the model
+trainer.train()
+
 # Function to classify sentiment
 def classify_text(text):
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
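
The diff view is truncated inside classify_text. Purely as a reading aid, below is a hedged sketch of how that function plausibly completes and gets wired into a Gradio interface: the torch.no_grad() inference, the argmax decoding, the label mapping (1 = positive, following the IMDb dataset's convention), and the gr.Interface call are assumptions, not part of this commit.

# Hedged sketch (not part of the commit): a plausible completion of
# classify_text plus the Gradio wiring. Setup mirrors app.py; the
# label mapping is an assumption.
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)

def classify_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():  # inference only, no gradient tracking
        logits = model(**inputs).logits
    pred = logits.argmax(dim=-1).item()  # index of the highest-scoring class
    return "Positive" if pred == 1 else "Negative"  # assumes IMDb convention: 1 = positive

gr.Interface(fn=classify_text, inputs="text", outputs="text").launch()

One version note: newer transformers releases (roughly v4.41 onward) rename TrainingArguments' evaluation_strategy argument to eval_strategy, so the name used in this commit may need updating on current installs.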