Update app.py
app.py
CHANGED
@@ -14,7 +14,6 @@ from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from transformers import pipeline
 
-
 # Load environment variables
 load_dotenv()
 
@@ -48,7 +47,6 @@ tone_categories = {
 }
 
 # Frame categories for fallback method
-
 # AI-Expanded Frame Categories for More Precise Categorization
 # Expanded Frame Categories for Better Categorization
 frame_categories = {
@@ -58,15 +56,8 @@ frame_categories = {
         "Civil Liberties": ["freedom", "expression", "privacy", "rights violations", "censorship", "surveillance", "press freedom", "free speech", "whistleblower"],
         "State Repression & Human Rights Abuses": ["police brutality", "enforced disappearances", "political prisoners", "arbitrary arrests", "martial law", "crackdowns"],
         "Women's Rights": [
-            "gender equality", "women's empowerment", "reproductive rights",
-            "gender-based violence", "sexual harassment", "domestic violence",
-            "equal pay", "education for women", "child marriage", "women's health",
-            "maternity leave", "women in leadership", "honor killings",
-            "karo kari", "patriarchal oppression", "honor-based violence",
-            "marital violence", "violence against women", "justice for women",
-            "reclaiming women's rights", "female autonomy", "societal control over women",
-            "women's freedom of choice", "women’s bodies, women’s rights",
-            "end honor killings", "violence against women must stop", "say no to patriarchy"]
+            "gender equality", "women's empowerment", "reproductive rights", "gender-based violence", "sexual harassment", "domestic violence", "equal pay", "education for women", "child marriage", "women's health", "maternity leave", "women in leadership", "honor killings", "karo kari", "patriarchal oppression", "honor-based violence", "marital violence", "violence against women", "justice for women", "reclaiming women's rights", "female autonomy", "societal control over women", "women's freedom of choice", "women’s bodies, women’s rights", "end honor killings", "violence against women must stop", "say no to patriarchy"
+        ]
     },
     "Political & State Accountability": {
         "Corruption & Governance": ["corruption", "government", "policy", "accountability", "transparency", "bribery", "misuse of power", "scandal", "nepotism", "tax fraud"],
@@ -159,8 +150,10 @@ def detect_language(text):
 # Extract tone using Groq API (or fallback method)
 def extract_tone(text):
     try:
-        response = llm.chat([
-
+        response = llm.chat([
+            {"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
+            {"role": "user", "content": text}
+        ])
         return response["choices"][0]["message"]["content"].split(", ")
     except Exception as e:
         logging.error(f"Groq API error: {e}")
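The rewritten extract_tone assumes the client returns an OpenAI-style completion dict whose message content is a comma-separated list of tone labels, which `.split(", ")` turns into a Python list. A minimal sketch of that parsing contract; the response dict here is a hand-built stand-in, not the Groq client's documented return value:

    # Hedged sketch: hand-built stand-in for whatever llm.chat() actually returns.
    # Only the parsing line mirrors extract_tone().
    response = {"choices": [{"message": {"content": "hopeful, defiant, urgent"}}]}

    tones = response["choices"][0]["message"]["content"].split(", ")
    print(tones)  # ['hopeful', 'defiant', 'urgent']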
@@ -183,9 +176,7 @@ def extract_hashtags(text):
 def categorize_frames(frame_list):
     frame_counter = Counter(frame_list)
     categorized_frames = {"Major Focus": [], "Significant Focus": [], "Minor Mention": []}
-
     sorted_frames = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
-
     for i, (frame, count) in enumerate(sorted_frames):
         if i == 0:  # Highest frequency frame
             categorized_frames["Major Focus"].append(frame)
@@ -193,24 +184,18 @@ def categorize_frames(frame_list):
             categorized_frames["Significant Focus"].append(frame)
         else:
             categorized_frames["Minor Mention"].append(frame)
-
     return categorized_frames
 
 # Extract frames using keyword matching and categorize
 def extract_frames_fallback(text):
     detected_frames = []
     text_lower = text.lower()
-
     # Iterate through the activism topics to match keywords
     for main_category, subcategories in frame_categories.items():
         for subcategory, keywords in subcategories.items():
-            # Check how many keywords from the subcategory are present in the text
             keyword_count = sum(1 for word in keywords if word in text_lower)
             if keyword_count > 0:
-                # Append a tuple with main category and subcategory
                 detected_frames.append((main_category, subcategory))
-
-    # Categorize detected frames based on their frequency
     return categorize_frames(detected_frames)
 
 # Extract captions from DOCX
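Note that within a single caption each (main category, subcategory) pair is appended at most once, so every Counter value is 1 and the Major/Significant/Minor split in categorize_frames effectively follows insertion order. A self-contained sketch of the matching step; the trimmed keyword dict is illustrative, not the app's full frame_categories:

    # Illustrative subset of frame_categories; the real dict in app.py is far larger.
    frame_categories = {
        "Human Rights & Social Justice": {
            "Civil Liberties": ["censorship", "free speech"],
            "Women's Rights": ["equal pay", "honor killings"],
        },
    }

    def extract_frames_fallback(text):
        detected = []
        text_lower = text.lower()
        for main_category, subcategories in frame_categories.items():
            for subcategory, keywords in subcategories.items():
                # Presence check: at most one tuple per subcategory per text
                if sum(1 for word in keywords if word in text_lower) > 0:
                    detected.append((main_category, subcategory))
        return detected

    print(extract_frames_fallback("A rally against censorship and for equal pay."))
    # [('Human Rights & Social Justice', 'Civil Liberties'),
    #  ('Human Rights & Social Justice', "Women's Rights")]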
@@ -244,40 +229,78 @@ def merge_metadata_with_generated_data(generated_data, excel_metadata):
         if post_number in generated_data:
             generated_data[post_number].update(post_data)
         else:
-            generated_data[post_number] = post_data
+            generated_data[post_number] = post_data
     return generated_data
 
 # Create DOCX file matching the uploaded format
 def create_docx_from_data(extracted_data):
     doc = Document()
-
     for post_number, data in extracted_data.items():
         doc.add_heading(post_number, level=1)
-
         ordered_keys = [
             "Post Number", "Date of Post", "Media Type", "Number of Pictures",
-            "Number of Videos", "Number of Audios", "Likes", "Comments",
-            "Full Caption", "Language", "Tone", "Hashtags", "Frames"
+            "Number of Videos", "Number of Audios", "Likes", "Comments",
+            "Tagged Audience", "Full Caption", "Language", "Tone", "Hashtags", "Frames"
         ]
-
         for key in ordered_keys:
             value = data.get(key, "N/A")
-
             if key in ["Tone", "Hashtags"]:
                 value = ", ".join(value) if isinstance(value, list) else value
             elif key == "Frames" and isinstance(value, dict):
-                frame_text = "\n".join([f"
+                frame_text = "\n".join([f"  {category}: {', '.join([' → '.join(frame) for frame in frames])}" for category, frames in value.items() if frames])
                 value = f"\n{frame_text}" if frame_text else "N/A"
-
             doc.add_paragraph(f"**{key}:** {value}")
-
         doc.add_paragraph("\n")
+    return doc
 
+# --------------------------
+# New functions for Frame Analysis
+# --------------------------
+
+# Aggregate frames from all posts into a simple dictionary (Frame 1: category, etc.)
+def aggregate_frames(output_data):
+    aggregated = {}
+    counter = 1
+    for post_data in output_data.values():
+        frames = post_data.get("Frames")
+        if frames and isinstance(frames, dict):
+            for category in ["Major Focus", "Significant Focus", "Minor Mention"]:
+                if category in frames and frames[category]:
+                    for frame in frames[category]:
+                        if isinstance(frame, tuple):
+                            frame_str = " → ".join(frame)
+                        else:
+                            frame_str = str(frame)
+                        aggregated[f"Frame {counter}"] = category
+                        counter += 1
+    return aggregated
+
+# Create a DOCX file for frame analysis with a table
+def create_frame_analysis_docx(frames_data):
+    doc = Document()
+    doc.add_heading("Frame Analysis", level=1)
+    table = doc.add_table(rows=1, cols=5)
+    table.style = 'Table Grid'
+    hdr_cells = table.rows[0].cells
+    hdr_cells[0].text = "Frame"
+    hdr_cells[1].text = "Major Focus"
+    hdr_cells[2].text = "Significant Focus"
+    hdr_cells[3].text = "Minor Mention"
+    hdr_cells[4].text = "Not Applicable"
+    for frame, category in frames_data.items():
+        row_cells = table.add_row().cells
+        row_cells[0].text = frame
+        row_cells[1].text = "✔ Major Focus" if category == "Major Focus" else "Major Focus"
+        row_cells[2].text = "✔ Significant Focus" if category == "Significant Focus" else "Significant Focus"
+        row_cells[3].text = "✔ Minor Mention" if category == "Minor Mention" else "Minor Mention"
+        row_cells[4].text = "✔ Not Applicable" if category == "Not Applicable" else "Not Applicable"
     return doc
 
-#
-
+# --------------------------
+# Streamlit App
+# --------------------------
 
+st.title("AI-Powered Coding Sheet Generator")
 st.write("Enter text or upload a DOCX/Excel file for analysis:")
 
 input_text = st.text_area("Input Text", height=200)
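As committed, aggregate_frames keys each row by a running counter and stores only the focus category; the frame_str it builds from the tuple is computed but never stored, so the table's first column reads "Frame 1", "Frame 2", and so on rather than the frame names. A condensed sketch of the resulting shape; the sample output_data is invented to mirror what extract_frames_fallback produces:

    # Invented sample mirroring the structure extract_frames_fallback() returns.
    output_data = {
        "Post 1": {
            "Frames": {
                "Major Focus": [("Human Rights & Social Justice", "Civil Liberties")],
                "Significant Focus": [],
                "Minor Mention": [("Political & State Accountability", "Corruption & Governance")],
            }
        }
    }

    # Condensed from the committed aggregate_frames (the unused frame_str is dropped).
    def aggregate_frames(output_data):
        aggregated = {}
        counter = 1
        for post_data in output_data.values():
            frames = post_data.get("Frames")
            if frames and isinstance(frames, dict):
                for category in ["Major Focus", "Significant Focus", "Minor Mention"]:
                    for frame in frames.get(category) or []:
                        aggregated[f"Frame {counter}"] = category
                        counter += 1
        return aggregated

    print(aggregate_frames(output_data))
    # {'Frame 1': 'Major Focus', 'Frame 2': 'Minor Mention'}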
@@ -285,7 +308,6 @@ uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
 uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
 
 output_data = {}
-
 if input_text:
     output_data["Manual Input"] = {
         "Full Caption": input_text,
@@ -317,10 +339,20 @@ if output_data:
         for key, value in data.items():
             st.write(f"**{key}:** {value}")
 
-
+    # Create and offer download for merged analysis DOCX
     docx_output = create_docx_from_data(output_data)
     docx_io = io.BytesIO()
     docx_output.save(docx_io)
     docx_io.seek(0)
     st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="coding_sheet.docx")
 
+    # Aggregate frames and create frame analysis DOCX
+    frames_data = aggregate_frames(output_data)
+    if frames_data:
+        frame_docx = create_frame_analysis_docx(frames_data)
+        frame_docx_io = io.BytesIO()
+        frame_docx.save(frame_docx_io)
+        frame_docx_io.seek(0)
+        st.download_button("Download Frame Analysis DOCX", data=frame_docx_io, file_name="frame_analysis.docx")
+
+
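Both download buttons reuse the same in-memory round trip: save the python-docx Document into a BytesIO buffer, rewind it, and hand the buffer to st.download_button. A standalone sketch of that pattern outside Streamlit; the output file name is illustrative:

    import io
    from docx import Document  # python-docx

    doc = Document()
    doc.add_heading("Frame Analysis", level=1)

    buffer = io.BytesIO()
    doc.save(buffer)   # python-docx writes the .docx zip container into the buffer
    buffer.seek(0)     # rewind so the next reader starts at byte 0

    # In the app this buffer goes straight to st.download_button(...); here we
    # persist it to disk to show the bytes form a complete .docx file.
    with open("frame_analysis_demo.docx", "wb") as f:
        f.write(buffer.getvalue())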