Spaces:

Alexvatti
/

Sentiment-Analysis-BERT

Sleeping

App Files Files Community

Alexvatti committed on Dec 6, 2024

Commit

b14184c

verified ·

1 Parent(s): f94741c

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -49

app.py CHANGED Viewed

@@ -3,81 +3,85 @@ import gradio as gr
 import numpy as np
 import pandas as pd
 from sklearn.model_selection import train_test_split
 import tensorflow as tf
-from transformers import BertTokenizer, TFBertModel
-from tensorflow.keras.layers import Dense
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.models import load_model
-from sklearn.metrics import classification_report,confusion_matrix
-import re
 import nltk
-from nltk.corpus import stopwords
 nltk.download('stopwords')
-# Load the tokenizer and model
 tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 bert_model = TFBertModel.from_pretrained("bert-base-uncased")
-# Define function to create embeddings
-def bert_embeddings(texts, max_length=64):
-    inputs = tokenizer(
-        texts.tolist(),
-        return_tensors="tf",
-        padding=True,
-        truncation=True,
-        max_length=max_length
-    )
-    outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
-    cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token's embedding
-    return cls_embeddings
 file_path = "https://raw.githubusercontent.com/alexvatti/full-stack-data-science/main/NLP-Exercises/Movie-Review/IMDB%20Dataset.csv"
-movies_df=pd.read_csv(file_path)
 def remove_tags(txt):
-    removelist = ""  # Add any characters you'd like to keep
-    # Remove HTML tags
     result = re.sub(r'<[^>]+>', '', txt)
-    # Remove URLs
-    result = re.sub(r'https?://\S+', '', txt)
-    # Remove non-alphanumeric characters (except for those in the removelist)
-    result = re.sub(r'[^a-zA-Z0-9' + removelist + r'\s]', ' ', txt)
-    # Convert to lowercase
-    result = result.lower()
-    return result
-def remove_stop_wrods(txt):
     stop_words = set(stopwords.words('english'))
-    return ' '.join([word for word in txt.split() if word not in (stop_words)])
 movies_df['review'] = movies_df['review'].apply(remove_tags)
-movies_df['review'] = movies_df['review'].apply(remove_stop_wrods)
-movies_df["Category"]=movies_df["sentiment"].apply(lambda x: 1 if x=='positive' else 0)
-X_train,X_test,y_train,y_test=train_test_split(movies_df['review'],movies_df["Category"],test_size=0.2,random_state=42)
-# Convert emails to BERT embeddings
-X_train_embeddings = bert_embeddings(X_train)
-X_test_embeddings = bert_embeddings(X_test)
-# Define a simple classifier model
 classifier = Sequential([
     Dense(128, activation='relu', input_shape=(768,)),
-    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
 ])
-# Compile the classifier
 classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
-# Train the classifier
 classifier.fit(X_train_embeddings, y_train, epochs=5, batch_size=32, validation_split=0.1)
-# Evaluate on test set
 test_loss, test_accuracy = classifier.evaluate(X_test_embeddings, y_test)
 print(f"Test Accuracy: {test_accuracy}")
 # Predictions and confusion matrix
 y_pred = (classifier.predict(X_test_embeddings) > 0.5).astype("int32")
 conf_matrix = confusion_matrix(y_test, y_pred)

 import numpy as np
 import pandas as pd
+import re
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense
+from transformers import BertTokenizer, TFBertModel
 from sklearn.model_selection import train_test_split
+from nltk.corpus import stopwords
 import tensorflow as tf
 import nltk
# The stop-word corpus has to be present locally before
# `stopwords.words('english')` can be read.
nltk.download('stopwords')

# Tokenizer and encoder are loaded from the same pretrained checkpoint so the
# token ids produced by one are the ids the other expects.
tokenizer, bert_model = (
    BertTokenizer.from_pretrained("bert-base-uncased"),
    TFBertModel.from_pretrained("bert-base-uncased"),
)

# Movie-review dataset, read straight from its raw GitHub URL. The rest of the
# script uses the `review` and `sentiment` columns.
file_path = "https://raw.githubusercontent.com/alexvatti/full-stack-data-science/main/NLP-Exercises/Movie-Review/IMDB%20Dataset.csv"
movies_df = pd.read_csv(file_path)
# Clean text
def remove_tags(txt):
    """Normalise a raw review into lowercase alphanumeric text.

    Three substitutions are applied in order: HTML-style tags, http(s) URLs,
    and finally every character that is neither ASCII alphanumeric nor
    whitespace. Each match is replaced by a single space rather than removed,
    so words separated only by markup (``"Great<br />Movie"``) stay separate
    tokens instead of being glued together. Runs of spaces are not collapsed;
    callers that need tokens should use ``str.split()``.

    Args:
        txt: The raw text to clean.

    Returns:
        The cleaned text, lowercased.
    """
    result = re.sub(r'<[^>]+>', ' ', txt)
    result = re.sub(r'https?://\S+', ' ', result)
    result = re.sub(r'[^a-zA-Z0-9\s]', ' ', result)
    return result.lower()
def remove_stop_words(txt, stop_words=None):
    """Drop stop words from whitespace-separated text.

    Args:
        txt: Text to filter. It is split on whitespace, and tokens are
            compared to the stop words exactly (case-sensitive).
        stop_words: Optional container of words to drop. When omitted, the
            NLTK English stop-word list is used. That list is built once and
            cached on the function, so applying this over a whole DataFrame
            column does not rebuild the set for every row.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = getattr(remove_stop_words, "_english_cache", None)
        if stop_words is None:
            # First call without an explicit set: load the corpus and keep it.
            stop_words = frozenset(stopwords.words('english'))
            remove_stop_words._english_cache = stop_words
    return ' '.join(word for word in txt.split() if word not in stop_words)
 movies_df['review'] = movies_df['review'].apply(remove_tags)
+movies_df['review'] = movies_df['review'].apply(remove_stop_words)
+movies_df['Category'] = movies_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
+# Train-test split
+X_train, X_test, y_train, y_test = train_test_split(movies_df['review'], movies_df['Category'], test_size=0.2, random_state=42)
+# Convert labels to TensorFlow format
+y_train = tf.convert_to_tensor(y_train.values, dtype=tf.float32)
+y_test = tf.convert_to_tensor(y_test.values, dtype=tf.float32)
# Batch-wise BERT embeddings
def bert_embeddings_batch(texts, batch_size=32, max_length=64):
    """Embed texts with BERT, one mini-batch at a time.

    The texts are tokenized and encoded in slices of ``batch_size`` so the
    encoder never receives more than ``batch_size`` sequences per forward
    pass. For every text the hidden state at the first token position is
    kept as its embedding.

    Args:
        texts: Sliceable sequence of strings (pandas Series, NumPy array or
            plain list).
        batch_size: Number of texts per forward pass; must be positive.
        max_length: Maximum token count per text; longer inputs are
            truncated by the tokenizer.

    Returns:
        A 2-D ``np.ndarray`` with one row per input text. For empty input an
        array with zero rows and ``bert_model.config.hidden_size`` columns is
        returned.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # Series/ndarray slices expose tolist(); plain lists are copied as-is.
        batch_texts = batch.tolist() if hasattr(batch, "tolist") else list(batch)
        inputs = tokenizer(
            batch_texts,
            return_tensors="tf",
            padding=True,
            truncation=True,
            max_length=max_length
        )
        outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # Position 0 of the last hidden state is used as the sentence vector.
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embeddings.numpy())

    if not embeddings:
        # np.vstack rejects an empty list, so build the empty result directly.
        # float32 is assumed to match the encoder output dtype (TODO confirm).
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)
    return np.vstack(embeddings)
# Compute embeddings for both splits up front; the classifier below is
# trained on these fixed vectors and the BERT weights are never updated.
X_train_embeddings = bert_embeddings_batch(X_train)
X_test_embeddings = bert_embeddings_batch(X_test)

# Define classifier: one hidden layer feeding a single sigmoid unit for the
# binary label. The input width (768) has to equal the number of columns in
# the embedding arrays.
classifier = Sequential([
    Dense(128, activation='relu', input_shape=(768,)),
    Dense(1, activation='sigmoid')
])
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train classifier, holding back 10% of the training rows for validation.
classifier.fit(X_train_embeddings, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate on the held-out test split.
test_loss, test_accuracy = classifier.evaluate(X_test_embeddings, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Predictions and confusion matrix
# Imported here because none of the import lines visible in this part of the
# file bring `confusion_matrix` into scope; without it the call below would
# raise NameError.
from sklearn.metrics import confusion_matrix

# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
y_pred = (classifier.predict(X_test_embeddings) > 0.5).astype("int32")
conf_matrix = confusion_matrix(y_test, y_pred)