Alexvatti committed on
Commit
b14184c
·
verified ·
1 Parent(s): f94741c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -49
app.py CHANGED
@@ -3,81 +3,85 @@ import gradio as gr
3
 
4
  import numpy as np
5
  import pandas as pd
 
 
 
 
6
  from sklearn.model_selection import train_test_split
7
-
8
  import tensorflow as tf
9
- from transformers import BertTokenizer, TFBertModel
10
- from tensorflow.keras.layers import Dense
11
- from tensorflow.keras.models import Sequential
12
- from tensorflow.keras.models import load_model
13
-
14
- from sklearn.metrics import classification_report,confusion_matrix
15
- import re
16
  import nltk
17
- from nltk.corpus import stopwords
 
18
  nltk.download('stopwords')
19
 
20
- # Load the tokenizer and model
21
  tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
22
  bert_model = TFBertModel.from_pretrained("bert-base-uncased")
23
 
# Define function to create embeddings
def bert_embeddings(texts, max_length=64, batch_size=32):
    """Return the BERT [CLS] embedding of every text as a single tensor.

    The texts are encoded in mini-batches instead of one forward pass over
    the whole input, so peak memory is bounded by ``batch_size`` rather than
    by the size of the corpus.

    Args:
        texts: Texts to embed. Either an object exposing ``tolist()`` (the
            script passes the pandas Series returned by
            ``train_test_split``) or any other iterable of strings.
        max_length: Maximum number of tokens kept per text; longer inputs
            are truncated by the tokenizer.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A tensor with one row per input text holding the last hidden state
        of the first token of each sequence.
    """
    # Materialise once so batching is purely positional, independent of any
    # index the input container may carry.
    items = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    if not items:
        # Nothing to encode: return an empty (0, hidden) tensor so callers
        # still receive a 2-D result.
        return tf.zeros((0, bert_model.config.hidden_size))

    chunks = []
    for start in range(0, len(items), batch_size):
        inputs = tokenizer(
            items[start:start + batch_size],
            return_tensors="tf",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # Index 0 on the sequence axis is the [CLS] token's embedding.
        chunks.append(outputs.last_hidden_state[:, 0, :])
    return tf.concat(chunks, axis=0)
36
-
37
  file_path = "https://raw.githubusercontent.com/alexvatti/full-stack-data-science/main/NLP-Exercises/Movie-Review/IMDB%20Dataset.csv"
38
- movies_df=pd.read_csv(file_path)
39
 
 
def remove_tags(txt, keep=""):
    """Strip markup, URLs and punctuation from *txt* and lowercase it.

    The steps are chained: each substitution works on the output of the
    previous one, so tag and URL removal are not lost.

    Args:
        txt: Raw text to clean.
        keep: Extra characters to preserve in addition to ASCII letters,
            digits and whitespace. Empty by default.

    Returns:
        The cleaned, lowercased text. Removed tags and punctuation are
        replaced by single spaces, so runs of spaces may remain.
    """
    # Replace tags with a space rather than nothing so that words separated
    # only by a tag (e.g. "good<br />movie") are not fused together.
    result = re.sub(r'<[^>]+>', ' ', txt)
    # Remove URLs.
    result = re.sub(r'https?://\S+', '', result)
    # Replace everything that is not alphanumeric, whitespace or explicitly
    # kept. ``keep`` is escaped so characters such as "-" or "]" stay literal
    # inside the character class.
    result = re.sub(r'[^a-zA-Z0-9' + re.escape(keep) + r'\s]', ' ', result)
    return result.lower()
51
-
# Lazily-filled cache of the NLTK English stop-word list.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words, loading the corpus on first use only."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stop_words(txt, stop_words=None):
    """Return *txt* with stop words removed and whitespace normalised.

    Args:
        txt: Text to filter; it is split on whitespace.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used. It is loaded once and
            reused, because this function is applied to every row of the
            dataset.

    Returns:
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return ' '.join(word for word in txt.split() if word not in stop_words)


# The script calls the function under its original (misspelled) name.
remove_stop_wrods = remove_stop_words
55
 
56
  movies_df['review'] = movies_df['review'].apply(remove_tags)
57
- movies_df['review'] = movies_df['review'].apply(remove_stop_wrods)
58
- movies_df["Category"]=movies_df["sentiment"].apply(lambda x: 1 if x=='positive' else 0)
59
-
60
- X_train,X_test,y_train,y_test=train_test_split(movies_df['review'],movies_df["Category"],test_size=0.2,random_state=42)
61
- # Convert reviews to BERT embeddings
62
- X_train_embeddings = bert_embeddings(X_train)
63
- X_test_embeddings = bert_embeddings(X_test)
64
-
65
- # Define a simple classifier model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  classifier = Sequential([
67
  Dense(128, activation='relu', input_shape=(768,)),
68
- Dense(1, activation='sigmoid') # Sigmoid for binary classification
69
  ])
70
 
71
- # Compile the classifier
72
  classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
73
 
74
- # Train the classifier
75
  classifier.fit(X_train_embeddings, y_train, epochs=5, batch_size=32, validation_split=0.1)
76
 
77
- # Evaluate on test set
78
  test_loss, test_accuracy = classifier.evaluate(X_test_embeddings, y_test)
79
  print(f"Test Accuracy: {test_accuracy}")
80
 
 
81
  # Predictions and confusion matrix
82
  y_pred = (classifier.predict(X_test_embeddings) > 0.5).astype("int32")
83
  conf_matrix = confusion_matrix(y_test, y_pred)
 
3
 
4
  import numpy as np
5
  import pandas as pd
6
+ import re
7
+ from tensorflow.keras.models import Sequential
8
+ from tensorflow.keras.layers import Dense
9
+ from transformers import BertTokenizer, TFBertModel
10
  from sklearn.model_selection import train_test_split
11
+ from nltk.corpus import stopwords
12
  import tensorflow as tf
 
 
 
 
 
 
 
13
  import nltk
14
+
# Fetch the NLTK stop-word corpus used by the text-cleaning step.
nltk.download('stopwords')

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

# Read the review dataset straight from its hosted CSV file.
file_path = "https://raw.githubusercontent.com/alexvatti/full-stack-data-science/main/NLP-Exercises/Movie-Review/IMDB%20Dataset.csv"
movies_df = pd.read_csv(file_path)
25
 
# Clean text
def remove_tags(txt):
    """Strip markup, URLs and punctuation from *txt* and lowercase it.

    Args:
        txt: Raw text to clean.

    Returns:
        The cleaned, lowercased text. Removed tags and punctuation are
        replaced by single spaces, so runs of spaces may remain.
    """
    # Replace tags with a space rather than nothing so that words separated
    # only by a tag (e.g. "good<br />movie") are not fused together.
    result = re.sub(r'<[^>]+>', ' ', txt)
    # Remove URLs.
    result = re.sub(r'https?://\S+', '', result)
    # Anything that is not an ASCII letter, digit or whitespace becomes a space.
    result = re.sub(r'[^a-zA-Z0-9\s]', ' ', result)
    return result.lower()
32
+
# Lazily-filled cache of the NLTK English stop-word list.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words, loading the corpus on first use only."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stop_words(txt, stop_words=None):
    """Return *txt* with stop words removed and whitespace normalised.

    Args:
        txt: Text to filter; it is split on whitespace.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used. It is loaded once and
            reused, because this function is applied to every row of the
            dataset.

    Returns:
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return ' '.join(word for word in txt.split() if word not in stop_words)
36
 
def _sentiment_to_label(sentiment):
    """Map the 'positive' sentiment string to 1 and anything else to 0."""
    if sentiment == 'positive':
        return 1
    return 0


# Clean every review: markup/punctuation first, then stop words.
movies_df['review'] = movies_df['review'].apply(remove_tags).apply(remove_stop_words)
movies_df['Category'] = movies_df['sentiment'].apply(_sentiment_to_label)

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    movies_df['review'],
    movies_df['Category'],
    test_size=0.2,
    random_state=42,
)

# Labels are handed to Keras as float32 tensors.
y_train = tf.convert_to_tensor(y_train.values, dtype=tf.float32)
y_test = tf.convert_to_tensor(y_test.values, dtype=tf.float32)
47
+
# Batch-wise BERT embeddings
def bert_embeddings_batch(texts, batch_size=32, max_length=64):
    """Return the BERT [CLS] embedding of every text as a NumPy array.

    The texts are encoded ``batch_size`` at a time so that only one batch of
    model activations is held at once.

    Args:
        texts: Texts to embed. Either an object exposing ``tolist()`` (the
            script passes the pandas Series returned by
            ``train_test_split``) or any other iterable of strings.
        batch_size: Number of texts sent through the model per forward pass.
        max_length: Maximum number of tokens kept per text; longer inputs
            are truncated by the tokenizer.

    Returns:
        A 2-D array with one row per input text holding the last hidden
        state of the first token of each sequence. Empty input yields an
        array with zero rows.
    """
    # Materialise once so batching is purely positional and plain lists are
    # accepted as well as Series/arrays.
    items = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    if not items:
        # np.vstack rejects an empty list, so build the (0, hidden) result
        # explicitly.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(items), batch_size):
        inputs = tokenizer(
            items[start:start + batch_size],
            return_tensors="tf",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # Index 0 on the sequence axis is the [CLS] token's embedding.
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embeddings.numpy())
    return np.vstack(embeddings)
64
+
# Compute embeddings
X_train_embeddings = bert_embeddings_batch(X_train)
X_test_embeddings = bert_embeddings_batch(X_test)

# Define classifier: one hidden layer on top of the fixed embeddings, with a
# sigmoid output for the binary label. The input width is taken from the
# embeddings themselves, so it follows the encoder's hidden size.
classifier = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_embeddings.shape[1],)),
    Dense(1, activation='sigmoid')
])

classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train classifier, holding out 10% of the training rows for validation
classifier.fit(X_train_embeddings, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate
test_loss, test_accuracy = classifier.evaluate(X_test_embeddings, y_test)
print(f"Test Accuracy: {test_accuracy}")

# confusion_matrix is used below, but the visible import lines do not bring it
# into scope, so import it here.
from sklearn.metrics import confusion_matrix

# Predictions and confusion matrix: threshold the sigmoid output at 0.5
y_pred = (classifier.predict(X_test_embeddings) > 0.5).astype("int32")
conf_matrix = confusion_matrix(y_test, y_pred)