Fanny1366 committed
Commit 6c6da50 · verified · 1 Parent(s): 289877f

Create app.py

Files changed (1)
  1. app.py +221 -0

app.py ADDED
@@ -0,0 +1,221 @@
+ # The first pipeline: Sentiment analysis
+ !pip uninstall -y wandb  # avoid experiment tracking
+ !pip install transformers[torch] -q
+ !pip install datasets -q
+ !pip install evaluate -q
+
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, pipeline
+ from datasets import Dataset
+ from evaluate import load
+ import numpy as np
+ import torch
+
+ # Step 1: Define your dataset
+ # Replace with your actual dataset
+ train_data = {
+     "Review": [
+         "This product is excellent, I love it!",
+         "Terrible experience, would not recommend.",
+         "It's okay, not great, but not bad either."
+     ],
+     "labels": [4, 0, 2]  # 0=Very Negative, 1=Negative, 2=Neutral, 3=Positive, 4=Very Positive
+ }
+ eval_data = {
+     "Review": [
+         "Amazing quality, worth the price!",
+         "Awful, completely disappointed."
+     ],
+     "labels": [4, 0]
+ }
+
+ # Convert the raw dicts to Hugging Face Dataset objects
+ small_train_dataset = Dataset.from_dict(train_data)
+ small_eval_dataset = Dataset.from_dict(eval_data)
+
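+ # A hedged sketch of swapping in a real dataset: load_dataset can read a local
+ # CSV with "Review" and "labels" columns ("reviews.csv" is a hypothetical file
+ # name, not part of this repo):
+ # from datasets import load_dataset
+ # real_ds = load_dataset("csv", data_files="reviews.csv")["train"]
+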
+ # Step 2: Load the model and tokenizer
+ model = AutoModelForSequenceClassification.from_pretrained(
+     "tabularisai/multilingual-sentiment-analysis",
+     num_labels=5  # must match the number of classes in your dataset
+ )
+ tokenizer = AutoTokenizer.from_pretrained(
+     "tabularisai/multilingual-sentiment-analysis"
+ )
+
+ # Step 3: Tokenize the datasets
+ def tokenize_function(examples):
+     return tokenizer(examples["Review"], padding="max_length", truncation=True, max_length=128)
+
+ tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
+ tokenized_eval = small_eval_dataset.map(tokenize_function, batched=True)
+
+ # Rename "labels" to "label" so both datasets use a consistent column name
+ tokenized_train = tokenized_train.rename_column("labels", "label")
+ tokenized_eval = tokenized_eval.rename_column("labels", "label")
+
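+ # Note (assumption about Trainer's default collator): transformers' default
+ # data collator maps a "label" column to the "labels" argument the model's
+ # forward() expects, so either column name would train correctly here.
+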
+ # Step 4: Define the metric computation
+ accuracy_metric = load("accuracy")  # load once and reuse across evaluations
+
+ def compute_metrics(eval_pred):
+     logits, labels = eval_pred
+     predictions = np.argmax(logits, axis=-1)
+     return accuracy_metric.compute(predictions=predictions, references=labels)
+
+ # Step 5: Configure training arguments
+ training_args = TrainingArguments(
+     output_dir="test_trainer",
+     num_train_epochs=1,              # single epoch for a quick demo; raise for real training
+     per_device_train_batch_size=4,   # adjust to available GPU memory
+     evaluation_strategy="epoch",     # evaluate after each epoch
+     save_strategy="no",              # skip checkpoints for simplicity
+     learning_rate=5e-5,              # standard fine-tuning learning rate
+     logging_dir="logs",              # log directory
+     seed=42                          # reproducibility
+ )
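+ # Version note (hedged): on recent transformers releases this argument is
+ # named eval_strategy; keep evaluation_strategy on older versions.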
+
+ # Step 6: Set up the Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_train,
+     eval_dataset=tokenized_eval,
+     compute_metrics=compute_metrics
+ )
+
+ # Seed NumPy and PyTorch directly as well, for reproducibility
+ np.random.seed(42)
+ torch.manual_seed(42)
+ if torch.cuda.is_available():
+     torch.cuda.manual_seed(42)
+
+ # Step 7: Train and evaluate
+ try:
+     print("Training the model...")
+     trainer.train()
+     print("Evaluating the model...")
+     eval_results = trainer.evaluate()
+     print("Evaluation Results:", eval_results)
+ except RuntimeError as e:
+     print("RuntimeError occurred:", str(e))
+
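+ # Optional follow-up sketch (not in the original script): persist the
+ # fine-tuned weights; "fine_tuned_sentiment" is an arbitrary directory name.
+ # trainer.save_model("fine_tuned_sentiment")
+ # tokenizer.save_pretrained("fine_tuned_sentiment")
+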
+ # Step 8: Use a pipeline for quick testing
+ print("\nPipeline Testing:")
+ sentiment_pipeline = pipeline(
+     "text-classification",
+     model=model,
+     tokenizer=tokenizer
+ )
+
+ # Example test case
+ text = "No commercials, and no adds no need for wifi it can use the satellite radio station to pick up or at least that's how it looks"
+ result = sentiment_pipeline(text)
+
+ # Label map matching the model's output label names
+ label_map = {
+     "Very Negative": 0,
+     "Negative": 1,
+     "Neutral": 2,
+     "Positive": 3,
+     "Very Positive": 4
+ }
+
+ # Map the predicted label name to its numeric id
+ predicted_label = label_map[result[0]['label']]
+ confidence = result[0]['score']
+
+ print(f"Text: {text}")
+ print(f"Predicted label: {predicted_label} ({result[0]['label']})")
+ print(f"Confidence: {confidence:.4f}")
+
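+ # Alternative sketch: most sequence-classification checkpoints ship an
+ # id2label table in their config, so the map can be derived instead of
+ # hard-coded (assumes this checkpoint uses the same human-readable names):
+ # label_map = {name: idx for idx, name in model.config.id2label.items()}
+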
+ # Batch testing
+ examples = [
+     {"text": "The stock market showed a strong recovery today.", "label": 4},
+     {"text": "The company's performance is a disaster!", "label": 0},
+     {"text": "It's a stable investment with consistent returns.", "label": 2}
+ ]
+
+ print("\nBatch Testing:")
+ for example in examples:
+     result = sentiment_pipeline(example["text"])
+     predicted_label = label_map[result[0]['label']]  # map the model's output name to an id
+     print(f"Text: {example['text'][:50]}...")
+     print(f"True: {example['label']} | Predicted: {predicted_label} ({result[0]['label']}) | Confidence: {result[0]['score']:.2f}")
+     print("-" * 60)
+
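+ # Illustrative addition (not in the original script): a simple correctness
+ # tally over the batch examples above.
+ correct = sum(
+     label_map[sentiment_pipeline(ex["text"])[0]["label"]] == ex["label"]
+     for ex in examples
+ )
+ print(f"Batch accuracy: {correct}/{len(examples)}")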
+
+
+ # The second pipeline: Text Extraction
+ # Installation
+ !pip install transformers keybert -q
+
+ from transformers import pipeline
+ from keybert import KeyBERT
+ from collections import defaultdict
+
+ # Initialize the models (the first run downloads the weights automatically)
+ kw_model = KeyBERT()  # keyword extraction
+ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")  # demand classification
+
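+ # Note (hedged): facebook/bart-large-mnli is a large checkpoint; on a GPU
+ # machine the pipeline call above accepts device=0 to avoid slow CPU inference.
+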
+ # Sample Spotify review data
+ reviews = [
+     {"text": "Love the Discover Weekly feature but ads are too frequent.", "rating": 4},
+     {"text": "App crashes every time I save a playlist.", "rating": 1},
+     {"text": "Please add a sleep timer option!", "rating": 5},
+     {"text": "Lyrics are out of sync with the music.", "rating": 2},
+ ]
+
+ # Predefined demand category labels
+ demand_labels = [
+     "feature request",    # asks for new functionality
+     "bug report",         # reports a defect
+     "content issue",      # content problems (e.g., lyrics)
+     "subscription",       # subscription related
+     "general feedback"    # everything else
+ ]
+
+ def analyze_reviews(reviews):
+     results = []
+     for review in reviews:
+         text = review["text"]
+         rating = review["rating"]
+
+         # 1. Keyword extraction (KeyBERT)
+         keywords = kw_model.extract_keywords(
+             text,
+             keyphrase_ngram_range=(1, 2),  # extract 1-2 word phrases
+             stop_words="english",          # filter English stop words
+             top_n=3                        # return the top 3 keywords
+         )
+         keywords = [kw[0] for kw in keywords]  # keep the keyword text, drop the scores
+
+         # 2. Demand classification (zero-shot)
+         demand_result = classifier(text, demand_labels)
+         primary_demand = demand_result["labels"][0]  # most likely demand type
+
+         # 3. Derive urgency from the rating
+         urgency = "low"
+         if rating <= 2:
+             urgency = "high"
+         elif rating <= 4:
+             urgency = "medium"
+
+         # Structured result
+         results.append({
+             "text": text,
+             "rating": rating,
+             "keywords": keywords,
+             "demand_type": primary_demand,
+             "urgency": urgency
+         })
+     return results
+
+ # Run the analysis
+ analysis_results = analyze_reviews(reviews)
+
+ # Print the structured results
+ for i, result in enumerate(analysis_results, 1):
+     print(f"\nReview {i}:")
+     print(f"Text: {result['text']}")
+     print(f"Rating: {result['rating']}/5")
+     print(f"Keywords: {', '.join(result['keywords'])}")
+     print(f"Demand Type: {result['demand_type']}")
+     print(f"Urgency: {result['urgency']}")