Spaces:

shresht8
/

sentiment-analysis-excel

Sleeping

App Files Files Community

sentiment-analysis-excel / app.py

shresht8

update app - extract likes/dislikes

50fe41a verified 3 months ago

raw

history blame contribute delete

12.1 kB

	from collections import defaultdict
	from pathlib import Path
	import tempfile
	from typing import Optional, Dict, Any, List, Annotated

	import gradio as gr
	import instructor
	from instructor import patch
	from openai import OpenAI
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	from pydantic import BaseModel, Field, field_validator, ValidationInfo
	import torch
	from transformers import AutoTokenizer, AutoModelForSequenceClassification

	from prompts import sentiments_prompt

	# Load model and tokenizer globally for efficiency: this runs once at import
	# time, and predict_sentiment_with_scores() reuses the same `tokenizer` and
	# `model` objects on every call instead of reloading them per request.
	# `model_name` is the identifier handed to both from_pretrained() calls.
	model_name = "tabularisai/multilingual-sentiment-analysis"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForSequenceClassification.from_pretrained(model_name)

	# Weight applied to each predicted class index when the class probabilities
	# are collapsed into one score. The position in the tuple is the class index.
	SENTIMENT_WEIGHTS = dict(
	    enumerate(
	        (
	            0.0,   # 0: Very Negative
	            0.25,  # 1: Negative
	            0.5,   # 2: Neutral
	            0.75,  # 3: Positive
	            1.0,   # 4: Very Positive
	        )
	    )
	)

	class ExtractProductSentiment(BaseModel):
	    """Extracts what people like and dislike about a product based on product reviews and sentiment scores (0-100)"""
	    # NOTE(review): the docstring above and the Field descriptions below are
	    # presumably forwarded to the LLM as part of the response schema (this
	    # class is passed as `response_model` in get_product_sentiment), so treat
	    # them as prompt text and confirm before rewording them.
	    product_likes: List[str] = Field(..., description="What people like about the product. List of 3 sentences AT MOST. Must be aggregated in the order of importance.")
	    product_dislikes: List[str] = Field(..., description="What people dislike about the product. List of 3 sentences AT MOST. Must be aggregated in the order of importance.")

	    @field_validator("product_likes", "product_dislikes")
	    @classmethod
	    def validate_product_likes_and_dislikes(cls, v: List[str], info: ValidationInfo) -> List[str]:
	        """Reject an empty list or a list with more than three entries.

	        Args:
	            v: The list supplied for the field being validated.
	            info: Validation context; only ``info.field_name`` is used, to
	                name the offending field in the error message.

	        Returns:
	            ``v`` unchanged when it holds between one and three entries.

	        Raises:
	            ValueError: If ``v`` is empty or has more than three entries.
	        """
	        if not v:
	            raise ValueError(f"At least one {info.field_name} must be provided. If nothing to say, please enter 'None'")

	        if len(v) > 3:
	            raise ValueError(
	                f"{info.field_name} contains {len(v)} points. Please aggregate the points to a maximum of 3 key points "
	                "in order of importance. Combine similar points together."
	            )
	        return v

	def predict_sentiment_with_scores(texts, batch_size=32):
	    """
	    Predict sentiment for a list of texts and return both class labels and sentiment scores.

	    Args:
	        texts: Iterable of review texts (a single string is treated as one
	            text). Items are converted with ``str()``; ``None`` becomes "".
	        batch_size: Number of texts tokenized and run through the model at a
	            time, so memory use does not grow with the size of the input.

	    Returns:
	        A tuple ``(predicted_classes, sentiment_scores)`` of two lists with
	        one entry per input text: the label of the most probable class, and
	        the probability-weighted score scaled to 0-100 and rounded to two
	        decimals. Both lists are empty when ``texts`` is empty.

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.
	    """
	    if batch_size < 1:
	        raise ValueError("batch_size must be at least 1")

	    if isinstance(texts, str):
	        texts = [texts]
	    # The tokenizer is only fed plain strings; coerce anything else (numbers, None).
	    texts = ["" if text is None else str(text) for text in texts]
	    if not texts:
	        # Nothing to classify; skip the tokenizer/model entirely.
	        return [], []

	    # Label for each class index (same index order as SENTIMENT_WEIGHTS).
	    sentiment_map = {
	        0: "Very Negative",
	        1: "Negative",
	        2: "Neutral",
	        3: "Positive",
	        4: "Very Positive"
	    }

	    predicted_classes = []
	    sentiment_scores = []
	    for start in range(0, len(texts), batch_size):
	        batch = texts[start:start + batch_size]
	        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
	        with torch.no_grad():
	            outputs = model(**inputs)

	        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

	        # Most probable class per text
	        predicted_classes.extend(
	            sentiment_map[class_idx]
	            for class_idx in torch.argmax(probabilities, dim=-1).tolist()
	        )

	        # Weighted sum of the class probabilities, scaled to 0-100
	        for row in probabilities.tolist():
	            score = sum(prob * SENTIMENT_WEIGHTS[i] for i, prob in enumerate(row))
	            sentiment_scores.append(round(score * 100, 2))

	    return predicted_classes, sentiment_scores

	# The OpenAI client is patched with instructor inside process_file(), which is what lets it accept `response_model`.

	def get_product_sentiment(client, reviews: List[str], scores: List[float]) -> ExtractProductSentiment:
	    """Ask the chat model what reviewers like and dislike about a product.

	    Args:
	        client: Chat client whose ``chat.completions.create`` accepts
	            ``response_model`` and ``max_retries`` (process_file passes an
	            instructor-patched OpenAI client).
	        reviews: Review texts.
	        scores: Sentiment score for each review, paired with ``reviews`` by
	            position (pairing stops at the shorter of the two).

	    Returns:
	        Whatever ``client.chat.completions.create`` returns for
	        ``response_model=ExtractProductSentiment``.
	    """
	    # One "Review (Score: ...)" line per review gives the model both the text and its score.
	    context_lines = []
	    for review, score in zip(reviews, scores):
	        context_lines.append(f"Review (Score: {score}): {review}")
	    prompt = sentiments_prompt.format(review_context="\n".join(context_lines))

	    messages = [
	        {"role": "system", "content": "You are a helpful product analyst."},
	        {"role": "user", "content": prompt},
	    ]
	    return client.chat.completions.create(
	        model="gpt-4o",
	        response_model=ExtractProductSentiment,
	        messages=messages,
	        max_retries=3,
	    )

	def _percentage(count, total):
	    """Return ``count`` as a percentage of ``total`` rounded to 2 decimals (0.0 when ``total`` is 0)."""
	    if not total:
	        return 0.0
	    return round((count / total) * 100, 2)


	def create_comparison_charts(sentiment_results, avg_sentiment_scores):
	    """
	    Create comparison charts for sentiment analysis across products.

	    Args:
	        sentiment_results: Mapping of product name to a pandas Series of
	            review counts indexed by sentiment label ("Very Positive",
	            "Positive", "Neutral", "Negative", "Very Negative"). Missing
	            labels count as 0.
	        avg_sentiment_scores: Mapping of product name to its average
	            sentiment score.

	    Returns:
	        A tuple ``(score_comparison_fig, distribution_fig, ratio_fig,
	        summary_df)``: a bar chart of average scores, a grouped bar chart of
	        positive/neutral/negative counts, a stacked bar chart of the same
	        groups as a percentage of each product's reviews, and the summary
	        DataFrame with one row per product.
	    """
	    # One colour scheme shared by both sentiment charts.
	    sentiment_colors = {
	        'Positive': '#2ECC71', # Green
	        'Neutral': '#F1C40F', # Yellow
	        'Negative': '#E74C3C' # Red
	    }
	    summary_columns = [
	        'Product', 'Average Sentiment Score', 'Total Reviews',
	        'Very Positive %', 'Positive %', 'Neutral %', 'Negative %', 'Very Negative %',
	    ]
	    distribution_columns = ['Product', 'Sentiment', 'Count', 'Percentage']

	    summary_data = []
	    distribution_data = []
	    for product, counts in sentiment_results.items():
	        total = int(counts.sum())
	        summary_data.append({
	            'Product': product,
	            'Average Sentiment Score': avg_sentiment_scores[product],
	            'Total Reviews': total,
	            'Very Positive %': _percentage(counts.get('Very Positive', 0), total),
	            'Positive %': _percentage(counts.get('Positive', 0), total),
	            'Neutral %': _percentage(counts.get('Neutral', 0), total),
	            'Negative %': _percentage(counts.get('Negative', 0), total),
	            'Very Negative %': _percentage(counts.get('Very Negative', 0), total),
	        })

	        # Aggregate positive and negative sentiments
	        aggregated_counts = {
	            'Positive': counts.get('Very Positive', 0) + counts.get('Positive', 0),
	            'Neutral': counts.get('Neutral', 0),
	            'Negative': counts.get('Very Negative', 0) + counts.get('Negative', 0)
	        }
	        for sentiment, count in aggregated_counts.items():
	            distribution_data.append({
	                'Product': product,
	                'Sentiment': sentiment,
	                'Count': int(count),
	                'Percentage': _percentage(count, total),
	            })

	    # Explicit columns keep the frames plottable even when there are no products.
	    summary_df = pd.DataFrame(summary_data, columns=summary_columns)
	    distribution_df = pd.DataFrame(distribution_data, columns=distribution_columns)

	    # Score comparison chart
	    score_comparison_fig = px.bar(
	        summary_df,
	        x='Product',
	        y='Average Sentiment Score',
	        title='Average Sentiment Scores by Product',
	        labels={'Average Sentiment Score': 'Score (0-100)'}
	    )

	    # Distribution chart (absolute counts, one bar per sentiment group)
	    distribution_fig = px.bar(
	        distribution_df,
	        x='Product',
	        y='Count',
	        color='Sentiment',
	        title='Sentiment Distribution by Product',
	        barmode='group',
	        color_discrete_map=sentiment_colors
	    )

	    # Ratio chart (percentage stacked bar): plot each group's share of the
	    # product's reviews rather than raw counts, so products with different
	    # review volumes are comparable.
	    ratio_fig = px.bar(
	        distribution_df,
	        x='Product',
	        y='Percentage',
	        color='Sentiment',
	        title='Sentiment Distribution Ratio by Product',
	        barmode='relative',
	        labels={'Percentage': 'Share of Reviews (%)'},
	        color_discrete_map=sentiment_colors
	    )

	    return score_comparison_fig, distribution_fig, ratio_fig, summary_df

	def _fit_points_to_rows(points, n_rows):
	    """Return ``points`` as exactly ``n_rows`` cell values: padded with "" or, if there are too many, with the overflow joined into the last cell."""
	    if n_rows <= 0:
	        return []
	    cells = [str(point) for point in points]
	    if len(cells) > n_rows:
	        # Fewer rows than points: keep every point by folding the tail into the last cell.
	        cells = cells[:n_rows - 1] + ["; ".join(cells[n_rows - 1:])]
	    return cells + [""] * (n_rows - len(cells))


	def process_single_sheet(df, product_name, openai_client):
	    """
	    Process a single dataframe and return sentiment analysis results.

	    Adds the columns 'Sentiment', 'Sentiment_Score', 'Likes' and 'Dislikes'
	    to ``df`` in place. The likes/dislikes are product-level points, written
	    one per row from the top; the remaining rows get "". Likes and dislikes
	    are filled independently, so a shorter list never truncates the other.

	    Args:
	        df: DataFrame with a 'Reviews' column. Missing reviews are treated as
	            "" and non-string cells are converted to text.
	        product_name: Name used in error messages.
	        openai_client: Client forwarded to get_product_sentiment().

	    Returns:
	        A tuple ``(df, sentiment_counts, avg_sentiment_score)``: the augmented
	        DataFrame, a Series counting reviews per sentiment label, and the mean
	        sentiment score rounded to two decimals.

	    Raises:
	        ValueError: If the 'Reviews' column is missing or the sheet has no rows.
	    """
	    if 'Reviews' not in df.columns:
	        raise ValueError(f"'Reviews' column not found in sheet/file for {product_name}")
	    if df.empty:
	        # Fail with a clear message instead of erroring inside the tokenizer
	        # or dividing by zero when averaging.
	        raise ValueError(f"No reviews found in sheet/file for {product_name}")

	    reviews = df['Reviews'].fillna("").astype(str).tolist()
	    sentiments, scores = predict_sentiment_with_scores(reviews)

	    df['Sentiment'] = sentiments
	    df['Sentiment_Score'] = scores

	    # Extract product likes and dislikes. This is best-effort: a failed LLM
	    # call leaves the columns empty rather than failing the whole analysis.
	    try:
	        product_sentiment = get_product_sentiment(openai_client, reviews, scores)
	        likes_list = list(product_sentiment.product_likes)
	        dislikes_list = list(product_sentiment.product_dislikes)
	    except Exception as e:
	        print(f"Error extracting likes/dislikes for {product_name}: {str(e)}")
	        likes_list, dislikes_list = [], []

	    # Assigning full-length lists is positional, so it works for any index
	    # and can never append rows the way label-based .loc assignment can.
	    df['Likes'] = _fit_points_to_rows(likes_list, len(df))
	    df['Dislikes'] = _fit_points_to_rows(dislikes_list, len(df))

	    # Calculate sentiment distribution
	    sentiment_counts = pd.Series(sentiments).value_counts()
	    avg_sentiment_score = round(sum(scores) / len(scores), 2)

	    return df, sentiment_counts, avg_sentiment_score

	def process_file(file_obj, api_key):
	    """
	    Process the input file and add sentiment analysis results.

	    A CSV file is analysed as a single product named "Product"; an Excel
	    workbook is analysed one sheet per product, using the sheet name as the
	    product name. The per-review results plus a 'Summary' sheet are written
	    to an Excel report.

	    Args:
	        file_obj: The uploaded file, either an object exposing the path as
	            ``.name`` or the path itself.
	        api_key: OpenAI API key used to build the instructor-patched client.

	    Returns:
	        A tuple ``(score_comparison_fig, distribution_fig, summary_df,
	        output_path)`` matching the outputs wired to the Analyze button.

	    Raises:
	        gr.Error: Wraps any failure (missing key or file, unsupported format,
	            processing errors) so the message is surfaced to the caller.
	    """
	    try:
	        if not api_key:
	            raise ValueError("OpenAI API key is required")
	        if file_obj is None:
	            raise ValueError("Please upload a CSV or Excel file.")

	        client = instructor.patch(OpenAI(api_key=api_key))

	        # Accept both a file-like object with a .name path and a plain path.
	        file_path = str(getattr(file_obj, "name", file_obj))
	        # Compare the extension case-insensitively so "REPORT.XLSX" is accepted too.
	        extension = Path(file_path).suffix.lower()

	        if extension == '.csv':
	            sheets = {"Product": pd.read_csv(file_path)} # Default name for CSV
	        elif extension in ('.xlsx', '.xls'):
	            # Open the workbook once and close it deterministically.
	            with pd.ExcelFile(file_path) as excel_file:
	                sheets = {
	                    sheet_name: excel_file.parse(sheet_name)
	                    for sheet_name in excel_file.sheet_names
	                }
	        else:
	            raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")

	        sentiment_results = {}
	        avg_sentiment_scores = {}
	        all_processed_dfs = {}
	        for product_name, df in sheets.items():
	            processed_df, sentiment_counts, avg_score = process_single_sheet(df, product_name, client)
	            all_processed_dfs[product_name] = processed_df
	            sentiment_results[product_name] = sentiment_counts
	            avg_sentiment_scores[product_name] = avg_score

	        # Create visualizations; the ratio chart is not shown in the UI.
	        score_comparison_fig, distribution_fig, _ratio_fig, summary_df = create_comparison_charts(
	            sentiment_results, avg_sentiment_scores
	        )

	        # Save results in a per-request temporary directory so concurrent
	        # requests cannot overwrite each other's report.
	        output_dir = Path(tempfile.mkdtemp(prefix="sentiment_analysis_"))
	        output_path = str(output_dir / "sentiment_analysis_results.xlsx")
	        with pd.ExcelWriter(output_path) as writer:
	            for sheet_name, df in all_processed_dfs.items():
	                df.to_excel(writer, sheet_name=sheet_name, index=False)
	            summary_df.to_excel(writer, sheet_name='Summary', index=False)

	        return score_comparison_fig, distribution_fig, summary_df, output_path

	    except Exception as e:
	        # Top-level boundary: surface the message while keeping the original traceback chained.
	        raise gr.Error(str(e)) from e


	# Gradio UI: heading and quick guide, then one row each for the API key, the
	# file upload and the Analyze button, followed by the result widgets. The
	# button runs process_file() and fills the four output components in order.
	with gr.Blocks() as interface:
	    gr.Markdown("# Product Review Sentiment Analysis")

	    gr.Markdown("""
	### Quick Guide
	1. Excel File (Multiple Products):
	- Create separate sheets for each product
	- Name sheets with product/company names
	- Include "Reviews" column in each sheet

	2. CSV File (Single Product):
	- Include "Reviews" column

	Upload your file and click Analyze to get started.
	""")

	    # --- Inputs ---
	    with gr.Row():
	        api_key_input = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API key", type="password")

	    with gr.Row():
	        file_input = gr.File(label="Upload File (CSV or Excel)", file_types=[".csv", ".xlsx", ".xls"])

	    with gr.Row():
	        analyze_btn = gr.Button("Analyze Sentiments")

	    # --- Outputs ---
	    with gr.Row():
	        sentiment_score_plot = gr.Plot(label="Weighted Sentiment Scores")

	    with gr.Row():
	        distribution_plot = gr.Plot(label="Sentiment Distribution")

	    with gr.Row():
	        summary_table = gr.Dataframe(label="Summary Metrics")

	    with gr.Row():
	        output_file = gr.File(label="Download Full Report")

	    # The outputs list follows the order of process_file()'s return tuple.
	    analyze_btn.click(
	        fn=process_file,
	        inputs=[file_input, api_key_input],
	        outputs=[sentiment_score_plot, distribution_plot, summary_table, output_file],
	    )

	# Launch interface
	interface.launch()