Update app.py
app.py CHANGED
@@ -10,7 +10,7 @@ import nltk
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 import re
-import
+import contractions
 from huggingface_hub import hf_hub_download
 import warnings
 from sklearn.exceptions import InconsistentVersionWarning
@@ -27,60 +27,71 @@ nltk.download('omw-1.4', quiet=True)
 # Initialize lemmatizer
 lemmatizer = WordNetLemmatizer()

-# Define LuongAttention
+# Define LuongAttention (matches training)
 class LuongAttention(tf.keras.layers.Layer):
     def __init__(self, **kwargs):
         super(LuongAttention, self).__init__(**kwargs)
-
+
     def build(self, input_shape):
-        self.W = self.add_weight(
-
-
-
-
-
-
-
-            shape=(input_shape[-1],),
-            initializer='zeros',
-            trainable=True
-        )
+        self.W = self.add_weight(name='attention_weight',
+                                 shape=(input_shape[-1], input_shape[-1]),
+                                 initializer='glorot_normal',
+                                 trainable=True)
+        self.b = self.add_weight(name='attention_bias',
+                                 shape=(input_shape[-1],),
+                                 initializer='zeros',
+                                 trainable=True)
         super(LuongAttention, self).build(input_shape)
-
+
     def call(self, inputs):
-
-
-
-
-        context
-
-        return context, attention_weights
-
+        e = tf.keras.backend.tanh(tf.keras.backend.dot(inputs, self.W) + self.b)
+        alpha = tf.keras.backend.softmax(e, axis=1)
+        context = inputs * alpha
+        context = tf.keras.backend.sum(context, axis=1)
+        return context
+
     def get_config(self):
         config = super(LuongAttention, self).get_config()
         return config

 # Load model, tokenizer, label encoder from Hugging Face Hub
 model_path = hf_hub_download(repo_id="logasanjeev/sentiment-analysis-bilstm-luong", filename="sentiment_model.h5")
-tokenizer_path = hf_hub_download(repo_id="logasanjeev/sentiment-analysis-bilstm-luong", filename="tokenizer.
+tokenizer_path = hf_hub_download(repo_id="logasanjeev/sentiment-analysis-bilstm-luong", filename="tokenizer.pkl")
 encoder_path = hf_hub_download(repo_id="logasanjeev/sentiment-analysis-bilstm-luong", filename="label_encoder.pkl")
 model = load_model(model_path, custom_objects={"LuongAttention": LuongAttention})
-with open(tokenizer_path, "
-    tokenizer =
+with open(tokenizer_path, "rb") as f:
+    tokenizer = pickle.load(f)
 with open(encoder_path, "rb") as f:
     label_encoder = pickle.load(f)

-#
+# Optimal threshold from training
+OPTIMAL_THRESHOLD = 0.5173
+
+# Text cleaning function (matches training)
 def clean_text(text):
     if not isinstance(text, str):
         text = str(text)
+    # Expand contractions
+    text = contractions.fix(text)
+    # Convert to lowercase
     text = text.lower()
+    # Remove URLs
     text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
-
-    text =
-
+    # Remove usernames and hashtags
+    text = re.sub(r'@\w+|#\w+', '', text)
+    # Remove HTML tags
+    text = re.sub(r'<.*?>+', '', text)
+    # Remove newlines
+    text = re.sub(r'\n', '', text)
+    # Remove numbers
+    text = re.sub(r'\w*\d\w*', '', text)
+    # Remove special characters
+    text = re.sub(r'[^\w\s]', '', text)
+    # Remove extra spaces
+    text = ' '.join(text.split())
+    # Tokenize and lemmatize
     tokens = word_tokenize(text)
-    tokens = [lemmatizer.lemmatize(token) for token in tokens]
+    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
     return ' '.join(tokens).strip()

 # Prediction function
@@ -95,21 +106,23 @@ def predict_sentiment(text):
         return "Text too short or invalid.", None, None

     # Pad sequence
-    max_len =
+    max_len = 60
     pad = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')

     # Predict
     with tf.device('/CPU:0'):
-
-
-
+        prob = model.predict(pad, verbose=0)[0][0]
+
+    # Apply threshold
+    label_idx = (prob >= OPTIMAL_THRESHOLD).astype(int)
+    sentiment = label_encoder.inverse_transform([label_idx])[0].lower()
+    confidence = prob if sentiment == 'positive' else 1 - prob

     # Format output
-    emoji = {"negative": "😣", "
+    emoji = {"negative": "😣", "positive": "😊"}
     probs_dict = {
-        "Negative":
-        "
-        "Positive": probs[2]
+        "Negative": 1 - prob,
+        "Positive": prob
     }

     return (
@@ -135,7 +148,7 @@ with gr.Blocks(theme="soft", css=css) as demo:
     gr.Markdown(
         """
         # Sentiment Analysis App
-        Predict the sentiment of your text (
+        Predict the sentiment of your text (Negative or Positive) using a BiLSTM model with Luong attention. Optimized threshold (0.5173) for 86.58% accuracy. Try it out!
         """
     )

@@ -143,7 +156,7 @@ with gr.Blocks(theme="soft", css=css) as demo:
     with gr.Column(scale=3):
         text_input = gr.Textbox(
             label="Your Text",
-            placeholder="e.g.,
+            placeholder="e.g., I wouldn't recommend it to anyone",
             lines=2
         )
         predict_btn = gr.Button("Analyze Sentiment", variant="primary")
@@ -154,11 +167,11 @@ with gr.Blocks(theme="soft", css=css) as demo:

     examples = gr.Examples(
         examples=[
-            "
-            "
-            "This place
-            "I
-            "
+            "I wouldn't recommend it to anyone",
+            "The food service is not good at all",
+            "This place is awesome!",
+            "I'm so happy with this product!",
+            "Why does everything go wrong today?"
         ],
         inputs=text_input
     )
@@ -179,4 +192,4 @@ with gr.Blocks(theme="soft", css=css) as demo:
     )

 # Launch app
-demo.launch()
+demo.launch()
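
For reference, the post-processing added in `predict_sentiment` (single sigmoid probability, fixed cutoff, derived confidence) can be exercised on its own. This is a minimal sketch, not part of the commit: it uses a dummy probability instead of a real model output, and it assumes the label encoder maps 0 to "negative" and 1 to "positive".

```python
# Standalone sketch of the thresholding logic from predict_sentiment.
OPTIMAL_THRESHOLD = 0.5173          # value taken from the commit
CLASSES = ["negative", "positive"]  # assumed LabelEncoder ordering

def label_from_prob(prob):
    """Mirror the app's post-processing for one sigmoid output."""
    label_idx = int(prob >= OPTIMAL_THRESHOLD)
    sentiment = CLASSES[label_idx]
    confidence = prob if sentiment == "positive" else 1 - prob
    return sentiment, confidence

print(label_from_prob(0.52))  # ('negative', 0.48): 0.52 is still below the 0.5173 cutoff
print(label_from_prob(0.90))  # ('positive', 0.9)
```

With a plain 0.5 cutoff the first example would flip to positive, which is why the tuned threshold ships alongside the model.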
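
The rewritten `LuongAttention.call` now returns only the pooled context vector (the old version also returned attention weights). A quick shape check, assuming a TensorFlow 2.x / Keras 2 environment where the app's `tf.keras.backend` calls are available:

```python
import tensorflow as tf

# Copy of the layer from the commit, used here only to show input/output shapes.
class LuongAttention(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight',
                                 shape=(input_shape[-1], input_shape[-1]),
                                 initializer='glorot_normal', trainable=True)
        self.b = self.add_weight(name='attention_bias',
                                 shape=(input_shape[-1],),
                                 initializer='zeros', trainable=True)
        super().build(input_shape)

    def call(self, inputs):
        # Score each timestep, softmax over the time axis, then weighted-sum it away.
        e = tf.keras.backend.tanh(tf.keras.backend.dot(inputs, self.W) + self.b)
        alpha = tf.keras.backend.softmax(e, axis=1)
        return tf.keras.backend.sum(inputs * alpha, axis=1)

x = tf.random.normal((2, 60, 128))   # (batch, timesteps=max_len, features) -- illustrative sizes
print(LuongAttention()(x).shape)     # (2, 128): one context vector per sequence
```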