File size: 5,373 Bytes
4cd06db |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
import re
import emoji
import joblib
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
import torch
import torch.nn as nn
import torchtext.vocab as vocab
import gradio as gr
# Load the pre-trained GloVe vectors ('6B' set, 100 dimensions per word).
# `glove.stoi` is used further down to map tokens to indices, and
# `glove.vectors` supplies the weights of the model's embedding layer.
glove = vocab.GloVe(name='6B', dim=100)
def remove_html(text: str) -> str:
    """Strip HTML-style tags from ``text``.

    Every span that starts with ``<`` and ends at the nearest following ``>``
    is deleted, brackets included. The text between tags is kept untouched.

    Args:
        text: Raw review text that may contain markup.

    Returns:
        ``text`` with all ``<...>`` spans removed.
    """
    # ``[^>]*`` is used instead of ``.*?`` because ``.`` does not match a
    # newline: a tag wrapped over several lines would otherwise be left in.
    return re.sub(r"<[^>]*>", "", text)
def remove_url(text: str) -> str:
    """Delete URLs from ``text``.

    A URL is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character.

    Args:
        text: Review text that may contain links.

    Returns:
        ``text`` with every matched URL replaced by the empty string. The
        whitespace surrounding a URL is left in place.
    """
    # Matching is case-insensitive: this runs before the text is lowercased,
    # so links typed as ``HTTPS://...`` or ``WWW....`` must be caught here.
    return re.sub(r"https?://\S+|www\.\S+", "", text, flags=re.IGNORECASE)
def emoji_to_text(text):
    """Replace each emoji character in ``text`` with its textual name.

    The string is scanned one character at a time. A character that
    ``emoji.is_emoji`` recognises is swapped for the result of
    ``emoji.demojize`` padded with one space on each side; every other
    character is copied through unchanged.
    """
    pieces = (
        f" {emoji.demojize(symbol)} " if emoji.is_emoji(symbol) else symbol
        for symbol in text
    )
    return "".join(pieces)
def clean_review_text(text):
    """Normalise a raw review before it is fed to the models.

    Steps, in order: strip HTML tags, strip URLs, spell out emoji as text,
    then lowercase the whole string.
    """
    # Markup and links go first so that nothing inside them is demojized.
    without_markup = remove_url(remove_html(text))
    return emoji_to_text(without_markup).lower()
# Main CNN model definition
class CNNHotelReviewsModel(nn.Module):
    """Convolutional text classifier on top of frozen pre-trained embeddings.

    One ``Conv2d`` per entry of ``filter_sizes`` slides over the embedded
    token sequence; each feature map is max-pooled over the sequence axis,
    the pooled vectors are concatenated, passed through dropout and a linear
    layer, and squashed with a sigmoid.

    Args:
        embedding_dim: Width of one embedding vector; also the kernel width,
            so each filter always spans whole word vectors.
        n_filters: Number of output channels of every convolution.
        filter_sizes: Iterable of kernel heights (tokens covered per filter).
        output_dim: Number of outputs of the final linear layer.
        dropout: Dropout probability applied to the concatenated features.
        pretrained_vectors: Optional 2-D tensor of embedding weights. When
            omitted, the module-level ``glove.vectors`` is used, which is the
            original behaviour.
    """

    def __init__(self, embedding_dim, n_filters, filter_sizes, output_dim, dropout,
                 pretrained_vectors=None):
        super().__init__()
        if pretrained_vectors is None:
            pretrained_vectors = glove.vectors
        # freeze=True: the embedding weights are not trained.
        self.embedding = nn.Embedding.from_pretrained(pretrained_vectors, freeze=True)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        """Score a batch of token-index sequences.

        Args:
            text: Integer tensor of token indices, shaped (batch, seq_len).
                ``seq_len`` must be at least ``max(filter_sizes)``, otherwise
                the convolution has nothing to slide over.

        Returns:
            The sigmoid of the linear layer's output with dimension 1
            squeezed out, i.e. shape (batch,) when ``output_dim`` is 1.
        """
        # (batch, seq_len, emb) -> (batch, 1, seq_len, emb): add the single
        # input channel that Conv2d expects.
        embedded = self.embedding(text).unsqueeze(1)
        # The kernel spans the full embedding width, so the last axis
        # collapses to size 1 and is squeezed away.
        # `nn.functional` is used because the file never imports an `F` alias.
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over the whole remaining sequence axis (global max pooling).
        pooled = [
            nn.functional.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2)
            for feature_map in conved
        ]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.sigmoid(self.fc(cat)).squeeze(1)
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters of the saved model.
# Best hyperparameters found: {'n_filters': 250, 'filter_sizes': [2, 3, 4], 'dropout': 0.1}
EMBEDDING_DIM = 100
N_FILTERS = 250
FILTER_SIZES = [2, 3, 4]
DROPOUT = 0.1
# A single sigmoid score; predict_review reports it as 'positive' and
# 1 - score as 'negative'.
OUTPUT_DIM = 1

# Build the network, restore the saved weights, then move it to the chosen
# device and switch it to evaluation mode.
CNN_Model = CNNHotelReviewsModel(
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
)
CNN_Model.load_state_dict(torch.load("hotel_review_model.pth", map_location=device))
CNN_Model = CNN_Model.to(device)
CNN_Model.eval()

# LDA model and its dictionary, used for aspect selection.
lda_model = joblib.load('lda_model.pkl')
dictionary = joblib.load('dictionary.pkl')
# CNN sentiment prediction
def predict_review(model, review, max_len=128):
    """Score one cleaned review with ``model``.

    The review is split on whitespace, each token is looked up in
    ``glove.stoi``, and the index list is cut or zero-padded to exactly
    ``max_len`` entries before being run through the model as a batch of one.

    Args:
        model: Callable network returning one score for a (1, max_len) index
            tensor.
        review: Review text, already cleaned.
        max_len: Fixed sequence length fed to the model.

    Returns:
        ``{'positive': p, 'negative': 1 - p}`` where ``p`` is the model output.
    """
    # Tokens missing from the vocabulary and padding slots both use index 0.
    # NOTE(review): index 0 is an ordinary vocabulary row, not a dedicated
    # pad/unknown entry -- confirm this matches how the model was trained.
    token_ids = [glove.stoi.get(word, 0) for word in review.split()][:max_len]
    token_ids.extend([0] * (max_len - len(token_ids)))

    # Shape (1, max_len): a batch holding just this review.
    batch = torch.tensor(token_ids).unsqueeze(0).to(device)

    model.eval()  # inference mode
    with torch.no_grad():
        prob = model(batch).item()

    return {'positive': prob, 'negative': 1-prob}
# Human-readable aspect name for each LDA topic id; the list position is the
# topic id (0-9).
aspect_label = dict(enumerate([
    "Reception & Service Efficiency",
    "Transportation & Proximity",
    "Room Comfort & Staff Courtesy",
    "Location & Staff Quality",
    "Room Discrepancies",
    "Hotel Quality vs Price",
    "Booking & Payment Issues",
    "Room Ambiance & Noise",
    "Amenities & Value",
    "Room Size & Condition",
]))
def dominant_topic(text):
    """Return the aspect distribution the LDA model assigns to ``text``.

    Despite the name, every topic returned by the model is reported, not only
    the strongest one; the caller decides how many to show.

    Args:
        text: Cleaned review text; it is split on whitespace into tokens.

    Returns:
        Dict mapping an aspect name from ``aspect_label`` to its probability
        as a plain ``float``. Empty when the model returns no topics.
    """
    tokens = text.split()
    bow = dictionary.doc2bow(tokens)
    # NOTE(review): assumed to yield (topic_id, probability) pairs, as the
    # original indexing with [0] and [1] implies -- confirm against the model.
    topics = lda_model.get_document_topics(bow)
    # The former `max(topics, ...)` result was never used and raised
    # ValueError on an empty topic list, so it has been dropped.
    return {aspect_label[topic_id]: float(prob) for topic_id, prob in topics}
def gr_fun(Review):
    """Gradio callback: clean the review, then score sentiment and aspects.

    Returns a pair ``(sentiment, aspects)``: the dict produced by
    ``predict_review`` and the dict produced by ``dominant_topic``, both
    computed from the same cleaned text.
    """
    cleaned = clean_review_text(Review)
    return predict_review(CNN_Model, cleaned), dominant_topic(cleaned)
# Gradio UI: one text box in, two label widgets out (sentiment scores and
# the five highest-scoring aspects).
iface = gr.Interface(
    fn=gr_fun,
    inputs="text",
    outputs=[gr.Label(), gr.Label(num_top_classes=5)],
    examples=[
        "room condition was very bad",
        "Staff where excellent and the room was lovely really great hotel will definitely be back",
        "Couldn t find ice machine The junior suite was excellent with a fantastic bar",
        "Furniture in the room was a bit worn and tired for the money you pay would just expect a bit more it was ok",
        "Room was West facing and was far too warm particularly as the a c didn t seem to be working to well The shower room was excellent and large enough for my lady and I to be rude in Loved it",
    ],
)

# The stray trailing `|` after this call was a syntax error and is removed.
iface.launch(inline=False)