Spaces:

Alexvatti
/

Sentiment-Analysis-BERT

Sleeping

App Files Files Community

Sentiment-Analysis-BERT / Movie-Review-Sentiment.py

Alexvatti

Create Movie-Review-Sentiment.py

fe30970 verified 5 months ago

raw

history blame contribute delete

3.31 kB

	import gradio as gr
	import numpy as np
	import pandas as pd
	import re
	from tensorflow.keras.models import Sequential
	from tensorflow.keras.layers import Dense
	from transformers import BertTokenizer, TFBertModel
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import confusion_matrix, classification_report
	from nltk.corpus import stopwords
	import tensorflow as tf
	import nltk

	# Fetch the NLTK stop-word corpus that remove_stop_words() reads.
	nltk.download('stopwords')

	# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
	BERT_CHECKPOINT = "bert-base-uncased"
	tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
	bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

	# Read the IMDB review CSV straight from GitHub into a DataFrame.
	file_path = (
	    "https://raw.githubusercontent.com/alexvatti/full-stack-data-science/"
	    "main/NLP-Exercises/Movie-Review/IMDB%20Dataset.csv"
	)
	movies_df = pd.read_csv(file_path)

	# Clean text
	def remove_tags(txt):
	    """Normalize a raw review string for downstream tokenization.

	    Strips HTML tags and http(s) URLs, replaces every character other than
	    ASCII letters, digits and whitespace with a space, and lower-cases the
	    result.

	    Args:
	        txt: Raw review text.

	    Returns:
	        The cleaned, lower-cased text. Runs of whitespace are not collapsed.
	    """
	    # Swap tags for a space rather than deleting them outright, so words
	    # separated only by markup ("great<br />movie") are not fused together.
	    result = re.sub(r'<[^>]+>', ' ', txt)
	    result = re.sub(r'https?://\S+', '', result)
	    result = re.sub(r'[^a-zA-Z0-9\s]', ' ', result)
	    return result.lower()

	# Lazily-populated cache of NLTK's English stop words (filled on first use).
	_ENGLISH_STOP_WORDS = None


	def _english_stop_words():
	    """Return the English stop-word set, loading it from NLTK only once."""
	    global _ENGLISH_STOP_WORDS
	    if _ENGLISH_STOP_WORDS is None:
	        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
	    return _ENGLISH_STOP_WORDS


	def remove_stop_words(txt):
	    """Drop English stop words from a whitespace-separated string.

	    Matching is exact and case-sensitive against NLTK's English stop-word
	    list, so callers should lower-case the text first.

	    Args:
	        txt: Text to filter.

	    Returns:
	        The remaining words joined by single spaces.
	    """
	    # The stop-word list is loaded once and reused; this function is applied
	    # to every review, so rebuilding the set per call is wasted work.
	    stop_words = _english_stop_words()
	    return ' '.join([word for word in txt.split() if word not in stop_words])

	# Clean every review: strip markup/punctuation first, then drop stop words.
	cleaned_reviews = movies_df['review'].apply(remove_tags)
	movies_df['review'] = cleaned_reviews.apply(remove_stop_words)

	# Binary target: 1 for 'positive' reviews, 0 for anything else.
	movies_df['Category'] = movies_df['sentiment'].eq('positive').astype('int64')

	# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
	X_train, X_test, y_train, y_test = train_test_split(
	    movies_df['review'],
	    movies_df['Category'],
	    test_size=0.2,
	    random_state=42,
	)

	# Turn both label Series into float32 tensors.
	y_train, y_test = (
	    tf.convert_to_tensor(labels.values, dtype=tf.float32)
	    for labels in (y_train, y_test)
	)

	# Compute BERT embeddings
	def bert_embeddings_batch(texts, batch_size=32, max_length=64):
	    """Encode texts with BERT and return one vector per text.

	    Args:
	        texts: Texts to embed. Either an object exposing ``tolist()`` (such as
	            a pandas Series or NumPy array) or any other iterable of strings.
	        batch_size: Number of texts pushed through the model at once.
	        max_length: Token limit per text; longer inputs are truncated.

	    Returns:
	        A 2-D NumPy array with one row per input text, holding the model's
	        final hidden state at sequence position 0 (the [CLS] position).
	    """
	    # Materialise to a plain list once so lists and tuples work as well as
	    # Series, and batching is always by position.
	    texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)

	    if not texts:
	        # np.vstack([]) raises ValueError; return a correctly-shaped empty result.
	        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

	    embeddings = []
	    for start in range(0, len(texts), batch_size):
	        inputs = tokenizer(
	            texts[start:start + batch_size],
	            return_tensors="tf",
	            padding=True,
	            truncation=True,
	            max_length=max_length,
	        )
	        outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
	        # Keep only the first token's hidden state for each sequence.
	        cls_embeddings = outputs.last_hidden_state[:, 0, :]
	        embeddings.append(cls_embeddings.numpy())
	    return np.vstack(embeddings)

	# Embed both splits up front; the classifier is fitted on these vectors.
	X_train_embeddings = bert_embeddings_batch(X_train)
	X_test_embeddings = bert_embeddings_batch(X_test)

	# Small feed-forward head: one hidden ReLU layer, then a single sigmoid unit.
	classifier = Sequential()
	classifier.add(Dense(128, activation='relu', input_shape=(768,)))
	classifier.add(Dense(1, activation='sigmoid'))

	classifier.compile(
	    loss='binary_crossentropy',
	    optimizer='adam',
	    metrics=['accuracy'],
	)

	# Fit for 5 epochs, holding back 10% of the training data for validation.
	classifier.fit(
	    X_train_embeddings,
	    y_train,
	    epochs=5,
	    batch_size=32,
	    validation_split=0.1,
	)

	# Score on the held-out test set.
	test_loss, test_accuracy = classifier.evaluate(X_test_embeddings, y_test)
	print(f"Test Accuracy: {test_accuracy}")

	# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions, then
	# compare them with the true labels.
	y_pred = (classifier.predict(X_test_embeddings) > 0.5).astype("int32")
	y_true = y_test.numpy()
	conf_matrix = confusion_matrix(y_true, y_pred)
	class_report = classification_report(y_true, y_pred)

	print("Confusion Matrix:", conf_matrix, sep="\n")
	print("\nClassification Report:", class_report, sep="\n")

	# Persist the trained classifier to an .h5 file.
	classifier.save("movie_sentiment_model.h5")