Update app.py
app.py CHANGED
@@ -10,7 +10,7 @@ import nltk
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 import re
-import
+import contractions
 from huggingface_hub import hf_hub_download
 import warnings
 from sklearn.exceptions import InconsistentVersionWarning
@@ -27,60 +27,71 @@ nltk.download('omw-1.4', quiet=True)
 # Initialize lemmatizer
 lemmatizer = WordNetLemmatizer()

-# Define LuongAttention
+# Define LuongAttention (matches training)
 class LuongAttention(tf.keras.layers.Layer):
     def __init__(self, **kwargs):
         super(LuongAttention, self).__init__(**kwargs)
-
+
     def build(self, input_shape):
-        self.W = self.add_weight(
-
-
-
-
-
-
-
-            shape=(input_shape[-1],),
-            initializer='zeros',
-            trainable=True
-        )
+        self.W = self.add_weight(name='attention_weight',
+                                 shape=(input_shape[-1], input_shape[-1]),
+                                 initializer='glorot_normal',
+                                 trainable=True)
+        self.b = self.add_weight(name='attention_bias',
+                                 shape=(input_shape[-1],),
+                                 initializer='zeros',
+                                 trainable=True)
         super(LuongAttention, self).build(input_shape)
-
+
     def call(self, inputs):
-
-
-
-
-        context
-
-        return context, attention_weights
-
+        e = tf.keras.backend.tanh(tf.keras.backend.dot(inputs, self.W) + self.b)
+        alpha = tf.keras.backend.softmax(e, axis=1)
+        context = inputs * alpha
+        context = tf.keras.backend.sum(context, axis=1)
+        return context
+
     def get_config(self):
         config = super(LuongAttention, self).get_config()
         return config

 # Load model, tokenizer, label encoder from Hugging Face Hub
 model_path = hf_hub_download(repo_id="logasanjeev/sentiment-analysis-bilstm-luong", filename="sentiment_model.h5")
-tokenizer_path = hf_hub_download(repo_id="logasanjeev/sentiment-analysis-bilstm-luong", filename="tokenizer.
+tokenizer_path = hf_hub_download(repo_id="logasanjeev/sentiment-analysis-bilstm-luong", filename="tokenizer.pkl")
 encoder_path = hf_hub_download(repo_id="logasanjeev/sentiment-analysis-bilstm-luong", filename="label_encoder.pkl")
 model = load_model(model_path, custom_objects={"LuongAttention": LuongAttention})
-with open(tokenizer_path, "
-    tokenizer =
+with open(tokenizer_path, "rb") as f:
+    tokenizer = pickle.load(f)
 with open(encoder_path, "rb") as f:
     label_encoder = pickle.load(f)

-#
+# Optimal threshold from training
+OPTIMAL_THRESHOLD = 0.5173
+
+# Text cleaning function (matches training)
 def clean_text(text):
     if not isinstance(text, str):
         text = str(text)
+    # Expand contractions
+    text = contractions.fix(text)
+    # Convert to lowercase
     text = text.lower()
+    # Remove URLs
     text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
-
-    text =
-
+    # Remove usernames and hashtags
+    text = re.sub(r'@\w+|#\w+', '', text)
+    # Remove HTML tags
+    text = re.sub(r'<.*?>+', '', text)
+    # Remove newlines
+    text = re.sub(r'\n', '', text)
+    # Remove numbers
+    text = re.sub(r'\w*\d\w*', '', text)
+    # Remove special characters
+    text = re.sub(r'[^\w\s]', '', text)
+    # Remove extra spaces
+    text = ' '.join(text.split())
+    # Tokenize and lemmatize
     tokens = word_tokenize(text)
-    tokens = [lemmatizer.lemmatize(token) for token in tokens]
+    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
     return ' '.join(tokens).strip()

 # Prediction function
@@ -95,21 +106,23 @@ def predict_sentiment(text):
         return "Text too short or invalid.", None, None

     # Pad sequence
-    max_len =
+    max_len = 60
     pad = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')

     # Predict
     with tf.device('/CPU:0'):
-
-
-
+        prob = model.predict(pad, verbose=0)[0][0]
+
+    # Apply threshold
+    label_idx = (prob >= OPTIMAL_THRESHOLD).astype(int)
+    sentiment = label_encoder.inverse_transform([label_idx])[0].lower()
+    confidence = prob if sentiment == 'positive' else 1 - prob

     # Format output
-    emoji = {"negative": "😣", "
+    emoji = {"negative": "😣", "positive": "😊"}
     probs_dict = {
-        "Negative":
-        "
-        "Positive": probs[2]
+        "Negative": 1 - prob,
+        "Positive": prob
     }

     return (
@@ -135,7 +148,7 @@ with gr.Blocks(theme="soft", css=css) as demo:
     gr.Markdown(
         """
         # Sentiment Analysis App
-        Predict the sentiment of your text (
+        Predict the sentiment of your text (Negative or Positive) using a BiLSTM model with Luong attention. Optimized threshold (0.5173) for 86.58% accuracy. Try it out!
         """
     )

@@ -143,7 +156,7 @@ with gr.Blocks(theme="soft", css=css) as demo:
     with gr.Column(scale=3):
         text_input = gr.Textbox(
             label="Your Text",
-            placeholder="e.g.,
+            placeholder="e.g., I wouldn't recommend it to anyone",
             lines=2
         )
         predict_btn = gr.Button("Analyze Sentiment", variant="primary")
@@ -154,11 +167,11 @@ with gr.Blocks(theme="soft", css=css) as demo:

     examples = gr.Examples(
         examples=[
-            "
-            "
-            "This place
-            "I
-            "
+            "I wouldn't recommend it to anyone",
+            "The food service is not good at all",
+            "This place is awesome!",
+            "I'm so happy with this product!",
+            "Why does everything go wrong today?"
         ],
         inputs=text_input
     )
@@ -179,4 +192,4 @@ with gr.Blocks(theme="soft", css=css) as demo:
     )

 # Launch app
-demo.launch()
+demo.launch()
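
For reference, the post-processing added in `predict_sentiment` (single sigmoid probability, fixed cutoff, derived confidence) can be exercised on its own. This is a minimal sketch, not part of the commit: it uses a dummy probability instead of a real model output, and it assumes the label encoder maps 0 to "negative" and 1 to "positive".

```python
# Standalone sketch of the thresholding logic from predict_sentiment.
OPTIMAL_THRESHOLD = 0.5173          # value taken from the commit
CLASSES = ["negative", "positive"]  # assumed LabelEncoder ordering

def label_from_prob(prob):
    """Mirror the app's post-processing for one sigmoid output."""
    label_idx = int(prob >= OPTIMAL_THRESHOLD)
    sentiment = CLASSES[label_idx]
    confidence = prob if sentiment == "positive" else 1 - prob
    return sentiment, confidence

print(label_from_prob(0.52))  # ('negative', 0.48): 0.52 is still below the 0.5173 cutoff
print(label_from_prob(0.90))  # ('positive', 0.9)
```

With a plain 0.5 cutoff the first example would flip to positive, which is why the tuned threshold ships alongside the model.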
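
The rewritten `LuongAttention.call` now returns only the pooled context vector (the old version also returned attention weights). A quick shape check, assuming a TensorFlow 2.x / Keras 2 environment where the app's `tf.keras.backend` calls are available:

```python
import tensorflow as tf

# Copy of the layer from the commit, used here only to show input/output shapes.
class LuongAttention(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight',
                                 shape=(input_shape[-1], input_shape[-1]),
                                 initializer='glorot_normal', trainable=True)
        self.b = self.add_weight(name='attention_bias',
                                 shape=(input_shape[-1],),
                                 initializer='zeros', trainable=True)
        super().build(input_shape)

    def call(self, inputs):
        # Score each timestep, softmax over the time axis, then weighted-sum it away.
        e = tf.keras.backend.tanh(tf.keras.backend.dot(inputs, self.W) + self.b)
        alpha = tf.keras.backend.softmax(e, axis=1)
        return tf.keras.backend.sum(inputs * alpha, axis=1)

x = tf.random.normal((2, 60, 128))   # (batch, timesteps=max_len, features) -- illustrative sizes
print(LuongAttention()(x).shape)     # (2, 128): one context vector per sequence
```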