# Hugging Face Space (status when captured: sleeping)
import gradio as gr | |
import numpy as np | |
import pandas as pd | |
import re | |
from tensorflow.keras.models import Sequential | |
from tensorflow.keras.layers import Dense | |
from transformers import BertTokenizer, TFBertModel | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import confusion_matrix, classification_report | |
from nltk.corpus import stopwords | |
import tensorflow as tf | |
import nltk | |
# Fetch the NLTK stop-word corpus that the text-cleaning step relies on.
nltk.download('stopwords')

# The tokenizer and the TensorFlow encoder are loaded from the same
# pretrained checkpoint.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

# IMDB review dataset, read directly from the CSV hosted on GitHub.
file_path = "https://raw.githubusercontent.com/alexvatti/full-stack-data-science/main/NLP-Exercises/Movie-Review/IMDB%20Dataset.csv"
movies_df = pd.read_csv(file_path)
# Text-cleaning helpers
# Compiled once at import time: remove_tags is applied to every review.
_TAG_RE = re.compile(r'<[^>]+>')
_URL_RE = re.compile(r'https?://\S+')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')


def remove_tags(txt):
    """Strip HTML tags, URLs and punctuation from *txt* and lowercase it.

    Each removed tag, URL or non-alphanumeric character is replaced by a
    single space rather than deleted. Deleting tags outright glues the
    neighbouring words together (``"good<br />movie"`` -> ``"goodmovie"``)
    and lets the URL pattern swallow the word that followed a tag.

    Runs of whitespace are not collapsed here; callers that need single
    spaces should split and re-join the result.

    Args:
        txt: Raw text, possibly containing HTML markup.

    Returns:
        The cleaned, lowercased text.
    """
    result = _TAG_RE.sub(' ', txt)
    result = _URL_RE.sub(' ', result)
    result = _NON_ALNUM_RE.sub(' ', result)
    return result.lower()
# Lazily-filled cache of the NLTK English stop-word list, so the corpus is
# loaded once instead of once per call.
_ENGLISH_STOP_WORDS = None


def remove_stop_words(txt, stop_words=None):
    """Return *txt* with stop words removed and whitespace normalised.

    The text is split on whitespace, tokens found in the stop-word
    collection are dropped, and the remainder is joined with single spaces.
    Matching is case-sensitive.

    Args:
        txt: Text to filter.
        stop_words: Optional collection of words to drop (a set gives the
            fastest lookups). When omitted, the NLTK English stop-word
            list is used; it is loaded on first use and cached.

    Returns:
        The filtered text.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    return ' '.join(word for word in txt.split() if word not in stop_words)
# Clean every review: strip markup and punctuation first, then drop stop words.
cleaned_reviews = movies_df['review'].apply(remove_tags).apply(remove_stop_words)
movies_df['review'] = cleaned_reviews

# Binary target: 1 for 'positive', 0 for any other sentiment value.
movies_df['Category'] = movies_df['sentiment'].apply(
    lambda label: 1 if label == 'positive' else 0
)

# Train-test split: hold out 20% of the rows, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    movies_df['review'],
    movies_df['Category'],
    test_size=0.2,
    random_state=42,
)

# Convert both label vectors to float32 TensorFlow tensors.
y_train, y_test = (
    tf.convert_to_tensor(split.values, dtype=tf.float32)
    for split in (y_train, y_test)
)
# Compute BERT [CLS] embeddings in batches
def bert_embeddings_batch(texts, batch_size=32, max_length=64):
    """Embed *texts* with BERT and return one vector per text.

    The texts are tokenized and encoded ``batch_size`` at a time using the
    module-level ``tokenizer`` and ``bert_model``. For each text, the hidden
    state at sequence position 0 of the last layer is kept.

    Args:
        texts: A single string or any iterable of strings (list, tuple,
            pandas Series, NumPy array, ...).
        batch_size: Number of texts encoded per forward pass.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        A 2-D NumPy array with one row per input text. Empty input yields
        an array with zero rows instead of raising.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]
    # Materialise once as a plain list: slicing is then purely positional
    # and no longer depends on the input offering ``.tolist()``.
    texts = list(texts)

    if not texts:
        # np.vstack rejects an empty sequence, so build the empty result
        # directly. float32 assumes the model runs in TensorFlow's default
        # precision -- TODO confirm if mixed precision is ever enabled.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(
            batch_texts,
            return_tensors="tf",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        outputs = bert_model(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
        # Position 0 of every sequence is used as the text embedding.
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embeddings.numpy())
    return np.vstack(embeddings)
# Embed both splits with the pretrained BERT encoder.
X_train_embeddings = bert_embeddings_batch(X_train)
X_test_embeddings = bert_embeddings_batch(X_test)

# Small feed-forward head on top of the 768-dimensional embeddings.
classifier = Sequential()
classifier.add(Dense(128, activation='relu', input_shape=(768,)))
classifier.add(Dense(1, activation='sigmoid'))
classifier.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit the head; 10% of the training rows are held out for validation.
classifier.fit(
    X_train_embeddings,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
)

# Score the classifier on the held-out test split.
test_loss, test_accuracy = classifier.evaluate(X_test_embeddings, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Threshold the sigmoid outputs at 0.5, then summarise the errors.
y_true = y_test.numpy()
y_pred = (classifier.predict(X_test_embeddings) > 0.5).astype("int32")
conf_matrix = confusion_matrix(y_true, y_pred)
class_report = classification_report(y_true, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Persist the trained classifier head to an .h5 file.
classifier.save("movie_sentiment_model.h5")