Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,7 @@ import re
|
|
5 |
import logging
|
6 |
import nltk
|
7 |
from docx import Document
|
|
|
8 |
import io
|
9 |
from langdetect import detect
|
10 |
from collections import Counter
|
@@ -46,7 +47,7 @@ tone_categories = {
|
|
46 |
"Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
|
47 |
}
|
48 |
|
49 |
-
# Frame categories for
|
50 |
frame_categories = {
|
51 |
"Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
|
52 |
"Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
|
@@ -65,8 +66,15 @@ frame_categories = {
|
|
65 |
"Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
|
66 |
}
|
67 |
|
68 |
-
# Initialize zero-shot classifier
|
69 |
-
classifier = pipeline(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
candidate_labels = ["Major Focus", "Significant Focus", "Minor Mention", "Not Applicable"]
|
71 |
|
72 |
def detect_language(text):
|
@@ -78,8 +86,10 @@ def detect_language(text):
|
|
78 |
|
79 |
def extract_tone(text):
|
80 |
try:
|
81 |
-
response = llm.chat([
|
82 |
-
|
|
|
|
|
83 |
return response["choices"][0]["message"]["content"].split(", ")
|
84 |
except Exception as e:
|
85 |
logging.error(f"Groq API error: {e}")
|
@@ -97,33 +107,42 @@ def extract_hashtags(text):
|
|
97 |
return re.findall(r"#\w+", text)
|
98 |
|
99 |
# -------------------------------------------------------------------
|
100 |
-
# New functions for qualitative frame categorization
|
101 |
# -------------------------------------------------------------------
|
102 |
|
103 |
def get_frame_category_mapping(text):
|
104 |
"""
|
105 |
-
For each frame
|
106 |
-
approach to qualitatively
|
107 |
-
|
108 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
mapping = {}
|
110 |
-
for frame in
|
111 |
-
|
112 |
-
result = classifier(text, candidate_labels=candidate_labels, hypothesis_template=hypothesis_template)
|
113 |
-
best_label = result["labels"][0] # select the highest scoring label
|
114 |
-
mapping[frame] = best_label
|
115 |
return mapping
|
116 |
|
117 |
def format_frame_categories_table(mapping):
|
118 |
"""
|
119 |
-
Returns a markdown-formatted table
|
120 |
Major Focus, Significant Focus, Minor Mention, and Not Applicable.
|
121 |
-
A tick (✓)
|
122 |
"""
|
123 |
header = "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |\n"
|
124 |
header += "| --- | --- | --- | --- | --- |\n"
|
125 |
-
rows = ""
|
126 |
tick = "✓"
|
|
|
127 |
for frame, category in mapping.items():
|
128 |
major = tick if category == "Major Focus" else ""
|
129 |
significant = tick if category == "Significant Focus" else ""
|
@@ -174,13 +193,38 @@ def create_docx_from_data(extracted_data):
|
|
174 |
ordered_keys = [
|
175 |
"Post Number", "Date of Post", "Media Type", "Number of Pictures",
|
176 |
"Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
|
177 |
-
"Full Caption", "Language", "Tone", "Hashtags"
|
178 |
]
|
179 |
for key in ordered_keys:
|
180 |
value = data.get(key, "N/A")
|
181 |
if key in ["Tone", "Hashtags"]:
|
182 |
value = ", ".join(value) if isinstance(value, list) else value
|
183 |
-
doc.add_paragraph(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
doc.add_paragraph("\n")
|
185 |
return doc
|
186 |
|
@@ -206,6 +250,7 @@ if input_text:
|
|
206 |
"Tone": extract_tone(input_text),
|
207 |
"Hashtags": extract_hashtags(input_text),
|
208 |
"Frames": frames_table,
|
|
|
209 |
}
|
210 |
|
211 |
if uploaded_docx:
|
@@ -219,6 +264,7 @@ if uploaded_docx:
|
|
219 |
"Tone": extract_tone(text),
|
220 |
"Hashtags": extract_hashtags(text),
|
221 |
"Frames": frames_table,
|
|
|
222 |
}
|
223 |
|
224 |
if uploaded_excel:
|
|
|
5 |
import logging
|
6 |
import nltk
|
7 |
from docx import Document
|
8 |
+
from docx.shared import Pt
|
9 |
import io
|
10 |
from langdetect import detect
|
11 |
from collections import Counter
|
|
|
47 |
"Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
|
48 |
}
|
49 |
|
50 |
+
# Frame categories for qualitative analysis
|
51 |
frame_categories = {
|
52 |
"Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
|
53 |
"Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
|
|
|
66 |
"Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
|
67 |
}
|
68 |
|
69 |
+
# Initialize the zero-shot classifier with a distilled (smaller, faster) MNLI model.
import torch  # imported here only to probe for CUDA before building the pipeline

classifier = pipeline(
    "zero-shot-classification",
    model="valhalla/distilbart-mnli-12-3",
    # Use the first GPU when one is present; -1 selects the CPU so start-up
    # does not fail on hosts without CUDA (a hard-coded device=0 would).
    device=0 if torch.cuda.is_available() else -1,
    torch_dtype="auto",  # let transformers choose the dtype; pass torch.float16 to force fp16
    batch_size=16,  # adjust to the available GPU memory
)
|
77 |
+
# Candidate labels for qualitative categorization (order matters only for display)
|
78 |
candidate_labels = ["Major Focus", "Significant Focus", "Minor Mention", "Not Applicable"]
|
79 |
|
80 |
def detect_language(text):
|
|
|
86 |
|
87 |
def extract_tone(text):
|
88 |
try:
|
89 |
+
response = llm.chat([
|
90 |
+
{"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
|
91 |
+
{"role": "user", "content": text}
|
92 |
+
])
|
93 |
return response["choices"][0]["message"]["content"].split(", ")
|
94 |
except Exception as e:
|
95 |
logging.error(f"Groq API error: {e}")
|
|
|
107 |
return re.findall(r"#\w+", text)
|
108 |
|
109 |
# -------------------------------------------------------------------
|
110 |
+
# New functions for qualitative frame categorization (batched version)
|
111 |
# -------------------------------------------------------------------
|
112 |
|
113 |
def get_frame_category_mapping(text):
    """
    Map every frame in ``frame_categories`` to its best qualitative label.

    For each frame, the zero-shot classifier scores the labels in
    ``candidate_labels`` against ``text`` using a frame-specific hypothesis
    ("This text is <label> about <frame>.") and the top-ranked label is kept.

    Args:
        text: The post/caption text to analyse.

    Returns:
        dict mapping each frame name to one of the ``candidate_labels``
        strings. Blank or missing text maps every frame to "Not Applicable".
    """
    frames = list(frame_categories)

    # Nothing to classify: an empty post cannot be "about" any frame.
    if not text or not text.strip():
        return {frame: "Not Applicable" for frame in frames}

    mapping = {}
    for frame in frames:
        # The pipeline accepts ONE hypothesis template string per call (it
        # calls .format(label) on it), so a list of templates cannot be
        # passed; classify once per frame instead. The candidate labels of
        # each call are still batched internally by the pipeline.
        result = classifier(
            text,
            candidate_labels=candidate_labels,
            hypothesis_template=f"This text is {{}} about {frame}.",
        )
        # Labels come back ordered by score; the first one is the best match.
        mapping[frame] = result["labels"][0]
    return mapping
|
135 |
|
136 |
def format_frame_categories_table(mapping):
|
137 |
"""
|
138 |
+
Returns a markdown-formatted table displaying each frame with columns:
|
139 |
Major Focus, Significant Focus, Minor Mention, and Not Applicable.
|
140 |
+
A tick (✓) marks the assigned category.
|
141 |
"""
|
142 |
header = "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |\n"
|
143 |
header += "| --- | --- | --- | --- | --- |\n"
|
|
|
144 |
tick = "✓"
|
145 |
+
rows = ""
|
146 |
for frame, category in mapping.items():
|
147 |
major = tick if category == "Major Focus" else ""
|
148 |
significant = tick if category == "Significant Focus" else ""
|
|
|
193 |
ordered_keys = [
|
194 |
"Post Number", "Date of Post", "Media Type", "Number of Pictures",
|
195 |
"Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
|
196 |
+
"Full Caption", "Language", "Tone", "Hashtags"
|
197 |
]
|
198 |
for key in ordered_keys:
|
199 |
value = data.get(key, "N/A")
|
200 |
if key in ["Tone", "Hashtags"]:
|
201 |
value = ", ".join(value) if isinstance(value, list) else value
|
202 |
+
para = doc.add_paragraph()
|
203 |
+
run = para.add_run(f"**{key}:** {value}")
|
204 |
+
run.font.size = Pt(11)
|
205 |
+
# Add a proper table for Frames if a mapping is available.
|
206 |
+
if "FramesMapping" in data:
|
207 |
+
doc.add_paragraph("Frames:")
|
208 |
+
mapping = data["FramesMapping"]
|
209 |
+
table = doc.add_table(rows=1, cols=5)
|
210 |
+
table.style = "Light List Accent 1"
|
211 |
+
hdr_cells = table.rows[0].cells
|
212 |
+
hdr_cells[0].text = "Frame"
|
213 |
+
hdr_cells[1].text = "Major Focus"
|
214 |
+
hdr_cells[2].text = "Significant Focus"
|
215 |
+
hdr_cells[3].text = "Minor Mention"
|
216 |
+
hdr_cells[4].text = "Not Applicable"
|
217 |
+
tick = "✓"
|
218 |
+
for frame, category in mapping.items():
|
219 |
+
row_cells = table.add_row().cells
|
220 |
+
row_cells[0].text = frame
|
221 |
+
row_cells[1].text = tick if category == "Major Focus" else ""
|
222 |
+
row_cells[2].text = tick if category == "Significant Focus" else ""
|
223 |
+
row_cells[3].text = tick if category == "Minor Mention" else ""
|
224 |
+
row_cells[4].text = tick if category == "Not Applicable" else ""
|
225 |
+
else:
|
226 |
+
value = data.get("Frames", "N/A")
|
227 |
+
doc.add_paragraph(f"**Frames:** {value}")
|
228 |
doc.add_paragraph("\n")
|
229 |
return doc
|
230 |
|
|
|
250 |
"Tone": extract_tone(input_text),
|
251 |
"Hashtags": extract_hashtags(input_text),
|
252 |
"Frames": frames_table,
|
253 |
+
"FramesMapping": frame_mapping
|
254 |
}
|
255 |
|
256 |
if uploaded_docx:
|
|
|
264 |
"Tone": extract_tone(text),
|
265 |
"Hashtags": extract_hashtags(text),
|
266 |
"Frames": frames_table,
|
267 |
+
"FramesMapping": frame_mapping
|
268 |
}
|
269 |
|
270 |
if uploaded_excel:
|