AA_TT3

Sleeping

App Files Files Community

ahm14 committed on Mar 2

Commit

e465fa1

verified ·

1 Parent(s): 74033b7

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -19

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import re
 import logging
 import nltk
 from docx import Document
 import io
 from langdetect import detect
 from collections import Counter
@@ -46,7 +47,7 @@ tone_categories = {
     "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
 }
-# Frame categories for fallback method
 frame_categories = {
     "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
     "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
@@ -65,8 +66,15 @@ frame_categories = {
     "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
 }
-# Initialize zero-shot classifier for qualitative frame categorization
-classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
 candidate_labels = ["Major Focus", "Significant Focus", "Minor Mention", "Not Applicable"]
 def detect_language(text):
@@ -78,8 +86,10 @@ def detect_language(text):
 def extract_tone(text):
     try:
-        response = llm.chat([{"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
-                             {"role": "user", "content": text}])
         return response["choices"][0]["message"]["content"].split(", ")
     except Exception as e:
         logging.error(f"Groq API error: {e}")
@@ -97,33 +107,42 @@ def extract_hashtags(text):
     return re.findall(r"#\w+", text)
 # -------------------------------------------------------------------
-# New functions for qualitative frame categorization using zero-shot classification
 # -------------------------------------------------------------------
 def get_frame_category_mapping(text):
     """
-    For each frame category defined in frame_categories, this function uses a zero-shot classification
-    approach to qualitatively assess how strongly the text discusses the frame. The classifier returns one of:
-    "Major Focus", "Significant Focus", "Minor Mention", or "Not Applicable".
     """
     mapping = {}
-    for frame in frame_categories.keys():
-        hypothesis_template = f"This text is {{}} about {frame}."
-        result = classifier(text, candidate_labels=candidate_labels, hypothesis_template=hypothesis_template)
-        best_label = result["labels"][0]  # select the highest scoring label
-        mapping[frame] = best_label
     return mapping
 def format_frame_categories_table(mapping):
     """
-    Returns a markdown-formatted table that displays each frame along with four columns:
     Major Focus, Significant Focus, Minor Mention, and Not Applicable.
-    A tick (✓) is shown only in the column corresponding to the assigned category.
     """
     header = "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |\n"
     header += "| --- | --- | --- | --- | --- |\n"
-    rows = ""
     tick = "✓"
     for frame, category in mapping.items():
         major = tick if category == "Major Focus" else ""
         significant = tick if category == "Significant Focus" else ""
@@ -174,13 +193,38 @@ def create_docx_from_data(extracted_data):
         ordered_keys = [
             "Post Number", "Date of Post", "Media Type", "Number of Pictures",
             "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
-            "Full Caption", "Language", "Tone", "Hashtags", "Frames"
         ]
         for key in ordered_keys:
             value = data.get(key, "N/A")
             if key in ["Tone", "Hashtags"]:
                 value = ", ".join(value) if isinstance(value, list) else value
-            doc.add_paragraph(f"**{key}:** {value}")
         doc.add_paragraph("\n")
     return doc
@@ -206,6 +250,7 @@ if input_text:
         "Tone": extract_tone(input_text),
         "Hashtags": extract_hashtags(input_text),
         "Frames": frames_table,
     }
 if uploaded_docx:
@@ -219,6 +264,7 @@ if uploaded_docx:
             "Tone": extract_tone(text),
             "Hashtags": extract_hashtags(text),
             "Frames": frames_table,
         }
 if uploaded_excel:

 import logging
 import nltk
 from docx import Document
+from docx.shared import Pt
 import io
 from langdetect import detect
 from collections import Counter
     "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
 }
+# Frame categories for qualitative analysis
 frame_categories = {
     "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
     "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
     "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
 }
+# Initialize zero-shot classifier with a distilled model on GPU device 0; dtype is selected automatically
+classifier = pipeline(
+    "zero-shot-classification",
+    model="valhalla/distilbart-mnli-12-3",  # distilled and faster model
+    device=0,                               # selects CUDA device 0 unconditionally (no CPU fallback)
+    torch_dtype="auto",                     # or use torch.float16 if you want to force fp16
+    batch_size=16                           # adjust batch_size based on your GPU memory
+)
+# Candidate labels for qualitative categorization (order matters only for display)
 candidate_labels = ["Major Focus", "Significant Focus", "Minor Mention", "Not Applicable"]
 def detect_language(text):
 def extract_tone(text):
     try:
+        response = llm.chat([
+            {"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
+            {"role": "user", "content": text}
+        ])
         return response["choices"][0]["message"]["content"].split(", ")
     except Exception as e:
         logging.error(f"Groq API error: {e}")
     return re.findall(r"#\w+", text)
 # -------------------------------------------------------------------
+# New functions for qualitative frame categorization (batched version)
 # -------------------------------------------------------------------
 def get_frame_category_mapping(text):
     """
+    For each frame defined in frame_categories, this function uses a zero-shot classification
+    approach to assess qualitatively how strongly the text discusses the frame.
+    It builds a batched list of hypothesis templates and returns a mapping from frame to the best label.
     """
+    frames = list(frame_categories.keys())
+    # Build hypothesis templates—one per frame:
+    hypothesis_templates = [f"This text is {{}} about {frame}." for frame in frames]
+    # Repeat the same input text for each frame in the batch.
+    texts = [text] * len(frames)
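+    # Batch call: intended to pair each copy of the text with its own hypothesis template.
+    # NOTE(review): the zero-shot pipeline documents hypothesis_template as a single str — confirm a list is accepted.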
+    results = classifier(
+        texts,
+        candidate_labels=candidate_labels,
+        hypothesis_template=hypothesis_templates,
+        batch_size=len(frames)  # process all at once
+    )
     mapping = {}
+    for frame, result in zip(frames, results):
+        mapping[frame] = result["labels"][0]
     return mapping
 def format_frame_categories_table(mapping):
     """
+    Returns a markdown-formatted table displaying each frame with columns:
     Major Focus, Significant Focus, Minor Mention, and Not Applicable.
+    A tick (✓) marks the assigned category.
     """
     header = "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |\n"
     header += "| --- | --- | --- | --- | --- |\n"
     tick = "✓"
+    rows = ""
     for frame, category in mapping.items():
         major = tick if category == "Major Focus" else ""
         significant = tick if category == "Significant Focus" else ""
         ordered_keys = [
             "Post Number", "Date of Post", "Media Type", "Number of Pictures",
             "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
+            "Full Caption", "Language", "Tone", "Hashtags"
         ]
         for key in ordered_keys:
             value = data.get(key, "N/A")
             if key in ["Tone", "Hashtags"]:
                 value = ", ".join(value) if isinstance(value, list) else value
+            para = doc.add_paragraph()
+            run = para.add_run(f"**{key}:** {value}")
+            run.font.size = Pt(11)
+        # Add a proper table for Frames if a mapping is available.
+        if "FramesMapping" in data:
+            doc.add_paragraph("Frames:")
+            mapping = data["FramesMapping"]
+            table = doc.add_table(rows=1, cols=5)
+            table.style = "Light List Accent 1"
+            hdr_cells = table.rows[0].cells
+            hdr_cells[0].text = "Frame"
+            hdr_cells[1].text = "Major Focus"
+            hdr_cells[2].text = "Significant Focus"
+            hdr_cells[3].text = "Minor Mention"
+            hdr_cells[4].text = "Not Applicable"
+            tick = "✓"
+            for frame, category in mapping.items():
+                row_cells = table.add_row().cells
+                row_cells[0].text = frame
+                row_cells[1].text = tick if category == "Major Focus" else ""
+                row_cells[2].text = tick if category == "Significant Focus" else ""
+                row_cells[3].text = tick if category == "Minor Mention" else ""
+                row_cells[4].text = tick if category == "Not Applicable" else ""
+        else:
+            value = data.get("Frames", "N/A")
+            doc.add_paragraph(f"**Frames:** {value}")
         doc.add_paragraph("\n")
     return doc
         "Tone": extract_tone(input_text),
         "Hashtags": extract_hashtags(input_text),
         "Frames": frames_table,
+        "FramesMapping": frame_mapping
     }
 if uploaded_docx:
             "Tone": extract_tone(text),
             "Hashtags": extract_hashtags(text),
             "Frames": frames_table,
+            "FramesMapping": frame_mapping
         }
 if uploaded_excel: