import gradio as gr
import numpy as np
import pandas as pd
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from nltk.corpus import stopwords
import tensorflow as tf
import nltk

# Download stopwords
nltk.download('stopwords')

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")
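# Note: BERT serves purely as a frozen feature extractor in this script; its
# weights are never updated. Only the small Keras head defined later is trained
# on the pre-computed embeddings.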

# Load dataset
file_path = "https://raw.githubusercontent.com/alexvatti/full-stack-data-science/main/NLP-Exercises/Movie-Review/IMDB%20Dataset.csv"
movies_df = pd.read_csv(file_path)

# Clean text: strip HTML tags and URLs, drop punctuation, and lowercase
def remove_tags(txt):
    result = re.sub(r'<[^>]+>', '', txt)             # HTML tags
    result = re.sub(r'https?://\S+', '', result)     # URLs
    result = re.sub(r'[^a-zA-Z0-9\s]', ' ', result)  # punctuation -> space
    return result.lower()

# Remove English stop words (build the set once rather than on every call)
STOP_WORDS = set(stopwords.words('english'))

def remove_stop_words(txt):
    return ' '.join(word for word in txt.split() if word not in STOP_WORDS)
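
# Quick illustration of the two cleaning steps on a made-up review (the input
# string is an invented example, not a row from the dataset):
#   remove_tags("<b>Great</b> movie! See https://example.com")
#     -> "great movie  see "
#   remove_stop_words("this movie was great")
#     -> "movie great"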

movies_df['review'] = movies_df['review'].apply(remove_tags)
movies_df['review'] = movies_df['review'].apply(remove_stop_words)
# Map sentiment labels to binary targets: 'positive' -> 1, anything else -> 0
movies_df['Category'] = movies_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(movies_df['review'], movies_df['Category'], test_size=0.2, random_state=42)

# Convert labels to TensorFlow format
y_train = tf.convert_to_tensor(y_train.values, dtype=tf.float32)
y_test = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

# Compute fixed BERT embeddings in batches; reviews are truncated to max_length tokens
def bert_embeddings_batch(texts, batch_size=32, max_length=64):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(
            batch_texts.tolist(),
            return_tensors="tf",
            padding=True,
            truncation=True,
            max_length=max_length
        )
        outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # Final hidden state of the [CLS] token as a fixed-size sentence embedding
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embeddings.numpy())
    return np.vstack(embeddings)
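
# The function above pools with the [CLS] token only. As an aside, mean pooling
# (averaging token states weighted by the attention mask) is a common
# alternative; the sketch below illustrates the idea and is not used anywhere
# else in this script.
def bert_mean_pool_batch(texts, batch_size=32, max_length=64):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(
            batch_texts.tolist(),
            return_tensors="tf",
            padding=True,
            truncation=True,
            max_length=max_length
        )
        outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # Zero out padding positions, then average over the sequence axis
        mask = tf.cast(inputs['attention_mask'][:, :, tf.newaxis], tf.float32)
        summed = tf.reduce_sum(outputs.last_hidden_state * mask, axis=1)
        counts = tf.reduce_sum(mask, axis=1)
        embeddings.append((summed / counts).numpy())
    return np.vstack(embeddings)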

# Embed the train and test splits (the slow step: one BERT forward pass per batch)
X_train_embeddings = bert_embeddings_batch(X_train)
X_test_embeddings = bert_embeddings_batch(X_test)

# Define a small dense classification head over the 768-dim BERT embeddings
classifier = Sequential([
    Dense(128, activation='relu', input_shape=(768,)),
    Dense(1, activation='sigmoid')
])

classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train only the head; 10% of the training embeddings are held out for validation
classifier.fit(X_train_embeddings, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate
test_loss, test_accuracy = classifier.evaluate(X_test_embeddings, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Predictions and confusion matrix (ravel to 1-D so scikit-learn does not warn)
y_pred = (classifier.predict(X_test_embeddings) > 0.5).astype("int32").ravel()
y_true = y_test.numpy().astype("int32")
conf_matrix = confusion_matrix(y_true, y_pred)
class_report = classification_report(y_true, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Save the trained head (HDF5 format). Note this saves only the Dense layers;
# BERT itself must be reloaded via from_pretrained at inference time.
classifier.save("movie_sentiment_model.h5")
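
# `gradio` is imported at the top of this file but no interface appears in the
# lines shown above, so the block below is only a hedged sketch of how the
# trained head could be served. `predict_sentiment` and the UI labels are
# illustrative assumptions, not part of the original script.
def predict_sentiment(review):
    # Apply the same preprocessing used at training time
    cleaned = remove_stop_words(remove_tags(review))
    embedding = bert_embeddings_batch(pd.Series([cleaned]))
    prob = float(classifier.predict(embedding)[0][0])
    return {"positive": prob, "negative": 1.0 - prob}

demo = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(lines=5, label="Movie review"),
    outputs=gr.Label(num_top_classes=2),
    title="IMDB Sentiment (BERT embeddings + dense head)",
)

if __name__ == "__main__":
    demo.launch()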