Create app.py

app.py

# The first pipeline: Sentiment analysis
!pip uninstall -y wandb  # avoid experiment tracking
!pip install transformers[torch] -q
!pip install datasets -q
!pip install evaluate -q

from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, pipeline
from datasets import Dataset
from evaluate import load
import numpy as np
import torch

# Step 1: Define your dataset
# Replace with your actual dataset
train_data = {
    "Review": [
        "This product is excellent, I love it!",
        "Terrible experience, would not recommend.",
        "It's okay, not great, but not bad either."
    ],
    "labels": [4, 0, 2]  # 0=Very Negative, 1=Negative, 2=Neutral, 3=Positive, 4=Very Positive
}
eval_data = {
    "Review": [
        "Amazing quality, worth the price!",
        "Awful, completely disappointed."
    ],
    "labels": [4, 0]
}

# Convert the raw dicts to Hugging Face Dataset objects
small_train_dataset = Dataset.from_dict(train_data)
small_eval_dataset = Dataset.from_dict(eval_data)

# Step 2: Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    "tabularisai/multilingual-sentiment-analysis",
    num_labels=5  # must match the number of labels in your dataset
)
tokenizer = AutoTokenizer.from_pretrained(
    "tabularisai/multilingual-sentiment-analysis"
)
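
# Optional sanity check: the checkpoint's config carries its own label names,
# so printing id2label confirms that the 0-4 scheme assumed above matches
# what the model was trained with.
print(model.config.id2label)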

# Step 3: Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples["Review"], padding="max_length", truncation=True, max_length=128)

tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
tokenized_eval = small_eval_dataset.map(tokenize_function, batched=True)
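
# Optional sanity check: after map(), each split carries the tokenizer outputs
# (input_ids, attention_mask, ...) alongside the original columns.
print(tokenized_train.column_names)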

# Rename "labels" to "label"; the Trainer's default data collator maps a
# "label" column back to the "labels" argument the model expects, so either
# name works here.
tokenized_train = tokenized_train.rename_column("labels", "label")
tokenized_eval = tokenized_eval.rename_column("labels", "label")

# Step 4: Define the metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = load("accuracy")
    return metric.compute(predictions=predictions, references=labels)
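
# A richer metrics function is a small extension; this sketch assumes the
# "f1" metric is available through the evaluate library (macro averaging
# weights all five classes equally). Swap it in via Trainer(compute_metrics=...).
def compute_metrics_with_f1(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load("accuracy").compute(predictions=predictions, references=labels)
    f1 = load("f1").compute(predictions=predictions, references=labels, average="macro")
    return {**accuracy, **f1}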

# Step 5: Configure training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=1,              # a single epoch keeps this demo fast; raise for real training
    per_device_train_batch_size=4,   # adjust to available GPU memory
    evaluation_strategy="epoch",     # evaluate after each epoch (renamed eval_strategy in newer transformers)
    save_strategy="no",              # skip checkpoints for simplicity
    learning_rate=5e-5,              # a standard fine-tuning learning rate
    logging_dir="logs",              # log directory
    seed=42                          # reproducibility
)

# Step 6: Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics
)

# Seed numpy and torch directly as well, in addition to TrainingArguments(seed=42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

# Step 7: Train and evaluate
try:
    print("Training the model...")
    trainer.train()
    print("Evaluating the model...")
    eval_results = trainer.evaluate()
    print("Evaluation Results:", eval_results)
except RuntimeError as e:
    print("RuntimeError occurred:", str(e))

# Step 8: Use a pipeline for quick testing
print("\nPipeline Testing:")
sentiment_pipeline = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer
)

# Example test case (a raw user review, kept verbatim)
text = "No commercials, and no adds no need for wifi it can use the satellite radio station to pick up or at least that's how it looks"
result = sentiment_pipeline(text)
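
# Side note (a sketch): in recent transformers versions, text-classification
# pipelines accept top_k=None to return scores for every class instead of
# only the top label.
all_scores = sentiment_pipeline(text, top_k=None)
print(all_scores)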

# Label map matching the model's human-readable output names
label_map = {
    "Very Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Very Positive": 4
}
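
# Alternative sketch: assuming the checkpoint's config defines these same
# label names, the map can be read from the config instead of hardcoded:
# label_map = model.config.label2id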

# Map the predicted label name to its numeric equivalent
predicted_label = label_map[result[0]['label']]
confidence = result[0]['score']

print(f"Text: {text}")
print(f"Predicted label: {predicted_label} ({result[0]['label']})")
print(f"Confidence: {confidence:.4f}")

# Batch testing
examples = [
    {"text": "The stock market showed a strong recovery today.", "label": 4},
    {"text": "The company's performance is a disaster!", "label": 0},
    {"text": "It's a stable investment with consistent returns.", "label": 2}
]

print("\nBatch Testing:")
for example in examples:
    result = sentiment_pipeline(example["text"])
    predicted_label = label_map[result[0]['label']]  # map the model's output name to an id
    print(f"Text: {example['text'][:50]}...")
    print(f"True: {example['label']} | Predicted: {predicted_label} ({result[0]['label']}) | Confidence: {result[0]['score']:.2f}")
    print("-" * 60)


# The second pipeline: Text extraction
# Installation
!pip install transformers keybert python-dotenv -q

from transformers import pipeline
from keybert import KeyBERT
from collections import defaultdict

# Initialize the models (downloaded automatically on first run)
kw_model = KeyBERT()  # keyword extraction
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")  # requirement classification
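
# Optional sketch: place the zero-shot pipeline on a GPU when one is available
# (device=0 selects the first CUDA device, -1 the CPU):
# classifier = pipeline("zero-shot-classification",
#                       model="facebook/bart-large-mnli",
#                       device=0 if torch.cuda.is_available() else -1)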

# Sample Spotify review data
reviews = [
    {"text": "Love the Discover Weekly feature but ads are too frequent.", "rating": 4},
    {"text": "App crashes every time I save a playlist.", "rating": 1},
    {"text": "Please add a sleep timer option!", "rating": 5},
    {"text": "Lyrics are out of sync with the music.", "rating": 2},
]

# Predefined requirement category tags
demand_labels = [
    "feature request",    # requests for new functionality
    "bug report",         # defect reports
    "content issue",      # content problems (e.g., lyrics)
    "subscription",       # subscription-related
    "general feedback"    # everything else
]

def analyze_reviews(reviews):
    results = []
    for review in reviews:
        text = review["text"]
        rating = review["rating"]

        # 1. Keyword extraction (KeyBERT)
        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 2),  # extract 1-2 word phrases
            stop_words="english",          # filter English stop words
            top_n=3                        # keep the top 3 keywords
        )
        keywords = [kw[0] for kw in keywords]  # keep the phrase, drop the score

        # 2. Requirement classification (zero-shot)
        demand_result = classifier(text, demand_labels)
        primary_demand = demand_result["labels"][0]  # most likely requirement type

        # 3. Derive urgency from the rating
        urgency = "low"
        if rating <= 2:
            urgency = "high"
        elif rating <= 4:
            urgency = "medium"

        # Structured result
        results.append({
            "text": text,
            "rating": rating,
            "keywords": keywords,
            "demand_type": primary_demand,
            "urgency": urgency
        })
    return results

# Run the analysis
analysis_results = analyze_reviews(reviews)

# Print the structured results
for i, result in enumerate(analysis_results, 1):
    print(f"\nReview {i}:")
    print(f"Text: {result['text']}")
    print(f"Rating: {result['rating']}/5")
    print(f"Keywords: {', '.join(result['keywords'])}")
    print(f"Demand Type: {result['demand_type']}")
    print(f"Urgency: {result['urgency']}")
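
# A closing sketch using the defaultdict import above: group the analyzed
# reviews by requirement type so clusters of high-urgency feedback stand out.
grouped = defaultdict(list)
for result in analysis_results:
    grouped[result["demand_type"]].append(result)

print("\nReviews by demand type:")
for demand_type, items in grouped.items():
    urgencies = [item["urgency"] for item in items]
    print(f"{demand_type}: {len(items)} review(s), urgency: {', '.join(urgencies)}")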