Alexvatti committed on
Commit
fe30970
·
verified ·
1 Parent(s): 9cb81e0

Create Movie-Review-Sentiment.py

Browse files
Files changed (1) hide show
  1. Movie-Review-Sentiment.py +95 -0
Movie-Review-Sentiment.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import pandas as pd
4
+ import re
5
+ from tensorflow.keras.models import Sequential
6
+ from tensorflow.keras.layers import Dense
7
+ from transformers import BertTokenizer, TFBertModel
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.metrics import confusion_matrix, classification_report
10
+ from nltk.corpus import stopwords
11
+ import tensorflow as tf
12
+ import nltk
13
+
14
# Fetch the NLTK stop-word corpus that remove_stop_words reads from.
nltk.download('stopwords')

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

# Read the review dataset (CSV with 'review' and 'sentiment' columns,
# both of which are used further down) straight from its URL.
file_path = "https://raw.githubusercontent.com/alexvatti/full-stack-data-science/main/NLP-Exercises/Movie-Review/IMDB%20Dataset.csv"
movies_df = pd.read_csv(file_path)
24
+
25
+ # Clean text
26
def remove_tags(txt):
    """Normalize a raw review into lowercase ASCII-alphanumeric text.

    Steps, in order:

    1. HTML-like tags (``<...>``) are replaced by a single space.
    2. ``http://`` / ``https://`` URLs are deleted.
    3. Every character that is not an ASCII letter, a digit or whitespace
       is replaced by a space.
    4. The result is lowercased.

    Tags are replaced by a space rather than removed outright so that words
    separated only by markup (``"good<br />movie"``) are not fused into a
    single token.  Runs of whitespace are not collapsed here.

    Args:
        txt: The raw review text.

    Returns:
        The cleaned, lowercased text.
    """
    # A space (not '') keeps the words on either side of a tag apart.
    result = re.sub(r'<[^>]+>', ' ', txt)
    result = re.sub(r'https?://\S+', '', result)
    result = re.sub(r'[^a-zA-Z0-9\s]', ' ', result)
    return result.lower()
31
+
32
# Filled on first use so the NLTK corpus is read once, not once per review.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loading it once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def remove_stop_words(txt):
    """Drop English stop words from whitespace-separated text.

    The text is split on whitespace, tokens that exactly match an entry of
    NLTK's English stop-word list are discarded, and the remaining tokens
    are joined back with single spaces.  Matching is case-sensitive; in this
    script the input has already been lowercased by ``remove_tags``.

    NOTE(review): the NLTK list presumably includes negations such as
    "not"/"no", which matter for sentiment -- confirm this filtering is
    actually wanted before the BERT encoder.

    Args:
        txt: Text to filter.

    Returns:
        The text with stop words removed and whitespace normalized to
        single spaces.
    """
    stop_words = _english_stop_words()
    return ' '.join([word for word in txt.split() if word not in stop_words])
35
+
36
# Clean every review: strip markup/punctuation first, then drop stop words.
movies_df['review'] = movies_df['review'].apply(remove_tags).apply(remove_stop_words)

# Binary target: 1 for 'positive', 0 for any other sentiment value.
movies_df['Category'] = movies_df['sentiment'].apply(lambda label: int(label == 'positive'))

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    movies_df['review'],
    movies_df['Category'],
    test_size=0.2,
    random_state=42,
)

# Labels become float32 tensors for the Keras classifier.
y_train, y_test = (
    tf.convert_to_tensor(labels.values, dtype=tf.float32)
    for labels in (y_train, y_test)
)
46
+
47
+ # Compute BERT embeddings
48
def bert_embeddings_batch(texts, batch_size=32, max_length=64):
    """Encode texts into BERT embeddings, one batch at a time.

    Each batch is tokenized with the module-level ``tokenizer`` (padded to
    the longest item in the batch, truncated to ``max_length`` tokens) and
    passed through the module-level ``bert_model``.  The vector at sequence
    position 0 of ``last_hidden_state`` (the ``[CLS]`` position for BERT
    tokenizers) is kept as the embedding of each text.

    Args:
        texts: The texts to encode: a pandas Series, a NumPy array, a list
            or any other iterable of strings.  A single string is treated
            as a one-element batch.
        batch_size: Number of texts per forward pass; must be positive.
        max_length: Maximum number of tokens per text; longer inputs are
            truncated by the tokenizer.

    Returns:
        A 2-D NumPy array with one row per input text.  For empty input the
        result has shape ``(0, bert_model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Materialize as a plain list so slicing is positional and the tokenizer
    # always receives a list of strings, whatever container was passed in.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    # np.vstack rejects an empty sequence, so short-circuit empty input.
    if not texts:
        # float32 presumably matches the encoder's output dtype -- TODO confirm.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        inputs = tokenizer(
            texts[start:start + batch_size],
            return_tensors="tf",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # Keep only the first-position vector of every sequence in the batch.
        embeddings.append(outputs.last_hidden_state[:, 0, :].numpy())
    return np.vstack(embeddings)
63
+
64
# Embed both splits with the pretrained BERT encoder (train first, then test).
X_train_embeddings, X_test_embeddings = (
    bert_embeddings_batch(split) for split in (X_train, X_test)
)

# Small feed-forward head on top of the 768-dimensional embeddings:
# one hidden ReLU layer followed by a single sigmoid output unit.
classifier = Sequential()
classifier.add(Dense(128, activation='relu', input_shape=(768,)))
classifier.add(Dense(1, activation='sigmoid'))

classifier.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit for 5 epochs, holding back 10% of the training data for validation.
classifier.fit(
    X_train_embeddings,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
)
78
+
79
# Loss and accuracy on the held-out test embeddings.
test_loss, test_accuracy = classifier.evaluate(X_test_embeddings, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
probabilities = classifier.predict(X_test_embeddings)
y_pred = (probabilities > 0.5).astype("int32")

y_true = y_test.numpy()
conf_matrix = confusion_matrix(y_true, y_pred)
class_report = classification_report(y_true, y_pred)

for heading, body in (
    ("Confusion Matrix:", conf_matrix),
    ("\nClassification Report:", class_report),
):
    print(heading)
    print(body)

# Persist the trained classifier head to an HDF5 file.
classifier.save("movie_sentiment_model.h5")
95
+