ahm14 committed · Commit d4724ab · verified · 1 Parent(s): 3bcf154

Update app.py

Files changed (1): app.py +67 -35
app.py CHANGED
@@ -14,7 +14,6 @@ from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from transformers import pipeline
 
-
 # Load environment variables
 load_dotenv()
 
@@ -48,7 +47,6 @@ tone_categories = {
 }
 
 # Frame categories for fallback method
-
 # AI-Expanded Frame Categories for More Precise Categorization
 # Expanded Frame Categories for Better Categorization
 frame_categories = {
@@ -58,15 +56,8 @@ frame_categories = {
         "Civil Liberties": ["freedom", "expression", "privacy", "rights violations", "censorship", "surveillance", "press freedom", "free speech", "whistleblower"],
         "State Repression & Human Rights Abuses": ["police brutality", "enforced disappearances", "political prisoners", "arbitrary arrests", "martial law", "crackdowns"],
         "Women's Rights": [
-            "gender equality", "women's empowerment", "reproductive rights",
-            "gender-based violence", "sexual harassment", "domestic violence",
-            "equal pay", "education for women", "child marriage", "women's health",
-            "maternity leave", "women in leadership", "honor killings",
-            "karo kari", "patriarchal oppression", "honor-based violence",
-            "marital violence", "violence against women", "justice for women",
-            "reclaiming women's rights", "female autonomy", "societal control over women",
-            "women's freedom of choice", "women’s bodies, women’s rights",
-            "end honor killings", "violence against women must stop", "say no to patriarchy"]
+            "gender equality", "women's empowerment", "reproductive rights", "gender-based violence", "sexual harassment", "domestic violence", "equal pay", "education for women", "child marriage", "women's health", "maternity leave", "women in leadership", "honor killings", "karo kari", "patriarchal oppression", "honor-based violence", "marital violence", "violence against women", "justice for women", "reclaiming women's rights", "female autonomy", "societal control over women", "women's freedom of choice", "women’s bodies, women’s rights", "end honor killings", "violence against women must stop", "say no to patriarchy"
+        ]
     },
     "Political & State Accountability": {
         "Corruption & Governance": ["corruption", "government", "policy", "accountability", "transparency", "bribery", "misuse of power", "scandal", "nepotism", "tax fraud"],
@@ -159,8 +150,10 @@ def detect_language(text):
 # Extract tone using Groq API (or fallback method)
 def extract_tone(text):
     try:
-        response = llm.chat([{"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
-                             {"role": "user", "content": text}])
+        response = llm.chat([
+            {"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
+            {"role": "user", "content": text}
+        ])
         return response["choices"][0]["message"]["content"].split(", ")
     except Exception as e:
         logging.error(f"Groq API error: {e}")
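For reference, the success path above assumes an OpenAI/Groq-style chat-completion dict and comma-separated labels in the reply; a minimal sketch with hypothetical values showing what the parse yields:

# Hypothetical response in the chat-completion shape extract_tone expects;
# the content string is illustrative only.
response = {"choices": [{"message": {"content": "angry, defiant, hopeful"}}]}
tones = response["choices"][0]["message"]["content"].split(", ")
print(tones)  # ['angry', 'defiant', 'hopeful']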
@@ -183,9 +176,7 @@ def extract_hashtags(text):
 def categorize_frames(frame_list):
     frame_counter = Counter(frame_list)
     categorized_frames = {"Major Focus": [], "Significant Focus": [], "Minor Mention": []}
-
     sorted_frames = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
-
     for i, (frame, count) in enumerate(sorted_frames):
         if i == 0:  # Highest frequency frame
             categorized_frames["Major Focus"].append(frame)
@@ -193,24 +184,18 @@ def categorize_frames(frame_list):
             categorized_frames["Significant Focus"].append(frame)
         else:
             categorized_frames["Minor Mention"].append(frame)
-
     return categorized_frames
 
 # Extract frames using keyword matching and categorize
 def extract_frames_fallback(text):
     detected_frames = []
     text_lower = text.lower()
-
     # Iterate through the activism topics to match keywords
     for main_category, subcategories in frame_categories.items():
         for subcategory, keywords in subcategories.items():
-            # Check how many keywords from the subcategory are present in the text
            keyword_count = sum(1 for word in keywords if word in text_lower)
            if keyword_count > 0:
-                # Append a tuple with main category and subcategory
                detected_frames.append((main_category, subcategory))
-
-    # Categorize detected frames based on their frequency
     return categorize_frames(detected_frames)
 
 # Extract captions from DOCX
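Note that keyword hits in the fallback are plain substring tests on the lowercased caption, so multi-word keywords must appear verbatim; a quick illustration with an invented caption:

text_lower = "End honor killings. Justice for women and freedom of the press".lower()
keywords = ["honor killings", "justice for women", "press freedom"]
keyword_count = sum(1 for word in keywords if word in text_lower)
print(keyword_count)  # 2: "press freedom" does not occur verbatim in the text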
@@ -244,40 +229,78 @@ def merge_metadata_with_generated_data(generated_data, excel_metadata):
         if post_number in generated_data:
             generated_data[post_number].update(post_data)
         else:
-            generated_data[post_number] = post_data
+            generated_data[post_number] = post_data
     return generated_data
 
 # Create DOCX file matching the uploaded format
 def create_docx_from_data(extracted_data):
     doc = Document()
-
     for post_number, data in extracted_data.items():
         doc.add_heading(post_number, level=1)
-
         ordered_keys = [
             "Post Number", "Date of Post", "Media Type", "Number of Pictures",
-            "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
-            "Full Caption", "Language", "Tone", "Hashtags", "Frames"
+            "Number of Videos", "Number of Audios", "Likes", "Comments",
+            "Tagged Audience", "Full Caption", "Language", "Tone", "Hashtags", "Frames"
         ]
-
         for key in ordered_keys:
             value = data.get(key, "N/A")
-
             if key in ["Tone", "Hashtags"]:
                 value = ", ".join(value) if isinstance(value, list) else value
             elif key == "Frames" and isinstance(value, dict):
-                frame_text = "\n".join([f" {category}: {', '.join([' → '.join(frame) for frame in frames])}" for category, frames in value.items() if frames])
+                frame_text = "\n".join([f" {category}: {', '.join([' → '.join(frame) for frame in frames])}" for category, frames in value.items() if frames])
                 value = f"\n{frame_text}" if frame_text else "N/A"
-
             doc.add_paragraph(f"**{key}:** {value}")
-
         doc.add_paragraph("\n")
+    return doc
 
+# --------------------------
+# New functions for Frame Analysis
+# --------------------------
+
+# Aggregate frames from all posts into a simple dictionary (Frame 1: category, etc.)
+def aggregate_frames(output_data):
+    aggregated = {}
+    counter = 1
+    for post_data in output_data.values():
+        frames = post_data.get("Frames")
+        if frames and isinstance(frames, dict):
+            for category in ["Major Focus", "Significant Focus", "Minor Mention"]:
+                if category in frames and frames[category]:
+                    for frame in frames[category]:
+                        if isinstance(frame, tuple):
+                            frame_str = " → ".join(frame)
+                        else:
+                            frame_str = str(frame)
+                        aggregated[f"Frame {counter}"] = category
+                        counter += 1
+    return aggregated
+
+# Create a DOCX file for frame analysis with a table
+def create_frame_analysis_docx(frames_data):
+    doc = Document()
+    doc.add_heading("Frame Analysis", level=1)
+    table = doc.add_table(rows=1, cols=5)
+    table.style = 'Table Grid'
+    hdr_cells = table.rows[0].cells
+    hdr_cells[0].text = "Frame"
+    hdr_cells[1].text = "Major Focus"
+    hdr_cells[2].text = "Significant Focus"
+    hdr_cells[3].text = "Minor Mention"
+    hdr_cells[4].text = "Not Applicable"
+    for frame, category in frames_data.items():
+        row_cells = table.add_row().cells
+        row_cells[0].text = frame
+        row_cells[1].text = "✔ Major Focus" if category == "Major Focus" else "Major Focus"
+        row_cells[2].text = "✔ Significant Focus" if category == "Significant Focus" else "Significant Focus"
+        row_cells[3].text = "✔ Minor Mention" if category == "Minor Mention" else "Minor Mention"
+        row_cells[4].text = "✔ Not Applicable" if category == "Not Applicable" else "Not Applicable"
     return doc
 
-# Streamlit app
-st.title("AI-Powered Coding Sheet Generator")
+# --------------------------
+# Streamlit App
+# --------------------------
 
+st.title("AI-Powered Coding Sheet Generator")
 st.write("Enter text or upload a DOCX/Excel file for analysis:")
 
 input_text = st.text_area("Input Text", height=200)
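For context on the new helpers: assuming `aggregate_frames` from this commit is in scope, a sketch with hypothetical post data shows the flat mapping it feeds into the table builder (note that the " → "-joined frame_str is computed but not included in the result):

# Hypothetical output_data, mirroring the shape produced upstream
output_data = {
    "Post 1": {
        "Frames": {
            "Major Focus": [("Human Rights & Justice", "Women's Rights")],
            "Significant Focus": [],
            "Minor Mention": [("Political & State Accountability", "Corruption & Governance")],
        }
    }
}
print(aggregate_frames(output_data))
# {'Frame 1': 'Major Focus', 'Frame 2': 'Minor Mention'}
# create_frame_analysis_docx then renders one table row per "Frame N" key,
# ticking the matching category column.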
@@ -285,7 +308,6 @@ uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
 uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
 
 output_data = {}
-
 if input_text:
     output_data["Manual Input"] = {
         "Full Caption": input_text,
@@ -317,10 +339,20 @@ if output_data:
         for key, value in data.items():
             st.write(f"**{key}:** {value}")
 
-    if output_data:
+    # Create and offer download for merged analysis DOCX
     docx_output = create_docx_from_data(output_data)
     docx_io = io.BytesIO()
     docx_output.save(docx_io)
     docx_io.seek(0)
     st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="coding_sheet.docx")
 
+    # Aggregate frames and create frame analysis DOCX
+    frames_data = aggregate_frames(output_data)
+    if frames_data:
+        frame_docx = create_frame_analysis_docx(frames_data)
+        frame_docx_io = io.BytesIO()
+        frame_docx.save(frame_docx_io)
+        frame_docx_io.seek(0)
+        st.download_button("Download Frame Analysis DOCX", data=frame_docx_io, file_name="frame_analysis.docx")
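The in-memory save/download pattern used for both buttons, shown standalone; the mime argument is an optional addition (an assumption, not part of this commit) that helps browsers treat the file as a Word document:

import io
from docx import Document

doc = Document()
doc.add_paragraph("example")
buf = io.BytesIO()
doc.save(buf)   # python-docx can save to any file-like object
buf.seek(0)     # rewind so Streamlit reads from the start
# st.download_button("Download", data=buf, file_name="example.docx",
#     mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document")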