ahm14 committed on
Commit
da716d7
·
verified ·
1 Parent(s): 442e623

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -78
app.py CHANGED
@@ -33,7 +33,7 @@ llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b
33
  # Download required NLTK resources
34
  nltk.download("punkt")
35
 
36
- # Updated tone categories
37
  tone_categories = {
38
  "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
39
  "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
@@ -41,32 +41,92 @@ tone_categories = {
41
  "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
42
  "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
43
  "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
44
- "Happy": ["joy", "celebration", "cheer", "success", "smile", "gratitude", "harmony"],
45
  "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
46
  "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
47
  "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
48
  "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
49
  }
50
 
51
- # Updated frame categories
 
52
  frame_categories = {
53
- "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
54
- "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
55
- "Gender & Patriarchy": ["gender", "women", "violence", "patriarchy", "equality"],
56
- "Religious Freedom & Persecution": ["religion", "persecution", "minorities", "intolerance", "faith"],
57
- "Grassroots Mobilization": ["activism", "community", "movement", "local", "mobilization"],
58
- "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
59
- "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
60
- "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
61
- "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
62
- "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
63
- "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
64
- "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
65
- "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
66
- "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
67
- "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  }
69
 
 
70
  # Detect language
71
  def detect_language(text):
72
  try:
@@ -98,10 +158,6 @@ def extract_tone_fallback(text):
98
  def extract_hashtags(text):
99
  return re.findall(r"#\w+", text)
100
 
101
- # Extract hashtags
102
- def extract_hashtags(text):
103
- return re.findall(r"#\w+", text)
104
-
105
  # Categorize frames into Major, Significant, and Minor based on frequency
106
  def categorize_frames(frame_list):
107
  frame_counter = Counter(frame_list)
@@ -110,9 +166,9 @@ def categorize_frames(frame_list):
110
  sorted_frames = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
111
 
112
  for i, (frame, count) in enumerate(sorted_frames):
113
- if i == 0:
114
  categorized_frames["Major Focus"].append(frame)
115
- elif i < 3:
116
  categorized_frames["Significant Focus"].append(frame)
117
  else:
118
  categorized_frames["Minor Mention"].append(frame)
@@ -120,27 +176,22 @@ def categorize_frames(frame_list):
120
  return categorized_frames
121
 
122
  # Extract frames using keyword matching and categorize
123
- def extract_frames_fallback(text):
124
  detected_frames = []
125
  text_lower = text.lower()
126
 
127
- for category, keywords in frame_categories.items():
128
- keyword_count = sum(1 for word in keywords if word in text_lower)
129
- if keyword_count > 0:
130
- detected_frames.append(category)
 
 
 
 
131
 
 
132
  return categorize_frames(detected_frames)
133
 
134
- # Extract metadata from Excel file
135
- def extract_metadata_from_excel(excel_file):
136
- try:
137
- df = pd.read_excel(excel_file)
138
- extracted_data = df.to_dict(orient="records")
139
- return extracted_data
140
- except Exception as e:
141
- logging.error(f"Error processing Excel file: {e}")
142
- return []
143
-
144
  # Extract captions from DOCX
145
  def extract_captions_from_docx(docx_file):
146
  doc = Document(docx_file)
@@ -155,51 +206,49 @@ def extract_captions_from_docx(docx_file):
155
  captions[current_post].append(text)
156
  return {post: " ".join(lines) for post, lines in captions.items() if lines}
157
 
158
- # Merge metadata and captions together
159
- def merge_metadata_with_captions(metadata, captions):
160
- merged_data = []
161
- for i, meta in enumerate(metadata):
162
- post_number = f"Post {i+1}"
163
- caption_text = captions.get(post_number, "No caption available")
164
-
165
- post_data = meta.copy()
166
- post_data["Full Caption"] = caption_text
167
- post_data["Language"] = detect_language(caption_text)
168
- post_data["Tone"] = extract_tone(caption_text)
169
- post_data["Hashtags"] = extract_hashtags(caption_text)
170
- post_data["Frames"] = extract_frames_fallback(caption_text)
171
-
172
- merged_data.append(post_data)
173
 
174
- return merged_data
 
 
 
 
 
 
 
 
175
 
176
- # Create DOCX file with correct formatting
177
  def create_docx_from_data(extracted_data):
178
  doc = Document()
179
 
180
- for index, data in enumerate(extracted_data, start=1):
181
- doc.add_heading(f"Sr No {index}:", level=1)
182
 
183
- metadata_fields = [
184
- "Date of Post", "Media Type", "Number of Pictures", "Number of Videos",
185
- "Number of Audios", "Likes", "Comments", "Tagged Audience"
 
186
  ]
187
 
188
- for field in metadata_fields:
189
- value = data.get(field, "N/A")
190
- doc.add_paragraph(f"**{field}:** {value}")
191
 
192
- doc.add_paragraph(f"**Caption:** {data.get('Full Caption', 'N/A')}")
 
 
 
 
193
 
194
- doc.add_paragraph(f"**Language:** {data.get('Language', 'N/A')}")
195
- doc.add_paragraph(f"**Tone:** {', '.join(data.get('Tone', ['N/A']))}")
196
- doc.add_paragraph(f"**Hashtags:** {', '.join(data.get('Hashtags', []))}")
197
-
198
- frames = data.get("Frames", {})
199
- doc.add_paragraph("**Frames:**")
200
- for category, frame_list in frames.items():
201
- if frame_list:
202
- doc.add_paragraph(f" {category}: {', '.join(frame_list)}")
203
 
204
  doc.add_paragraph("\n")
205
 
@@ -208,17 +257,49 @@ def create_docx_from_data(extracted_data):
208
  # Streamlit app
209
  st.title("AI-Powered Activism Message Analyzer")
210
 
 
 
 
211
  uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
212
  uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
213
 
214
- if uploaded_excel and uploaded_docx:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  excel_metadata = extract_metadata_from_excel(uploaded_excel)
216
- docx_captions = extract_captions_from_docx(uploaded_docx)
217
 
218
- merged_data = merge_metadata_with_captions(excel_metadata, docx_captions)
219
- docx_output = create_docx_from_data(merged_data)
 
 
 
 
220
 
 
 
221
  docx_io = io.BytesIO()
222
  docx_output.save(docx_io)
223
  docx_io.seek(0)
224
  st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="merged_analysis.docx")
 
 
33
  # Download required NLTK resources
34
  nltk.download("punkt")
35
 
36
+ # Tone categories for fallback method
37
  tone_categories = {
38
  "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
39
  "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
 
41
  "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
42
  "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
43
  "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
 
44
  "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
45
  "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
46
  "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
47
  "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
48
  }
49
 
50
# Frame categories for the keyword-matching fallback.
#
# Structure: main category -> subcategory -> list of trigger keywords.
# Keywords are matched as substrings of the lowercased caption, so every
# keyword must itself be lowercase or it can never match.
frame_categories = {
    "Human Rights & Justice": {
        "Legal Rights & Reforms": ["law", "justice", "legal", "reforms", "legislation"],
        "Humanitarian Issues": ["humanitarian", "aid", "refugees", "asylum", "crisis response"],
        "Civil Liberties": ["freedom", "expression", "privacy", "rights violations"],
    },
    "Political & State Accountability": {
        "Corruption & Governance": ["corruption", "government", "policy", "accountability", "transparency"],
        "Political Oppression": ["authoritarianism", "censorship", "state control", "dissent", "crackdown"],
        "Elections & Political Representation": ["voting", "elections", "political participation", "democracy"],
    },
    "Gender & Patriarchy": {
        "Gender-Based Violence": ["violence", "domestic abuse", "sexual harassment", "femicide"],
        "Women's Rights & Equality": ["gender equality", "feminism", "reproductive rights", "patriarchy"],
        # Lowercased ("lgbtq+") so it can match the lowercased caption text.
        "LGBTQ+ Rights": ["queer rights", "lgbtq+", "gender identity", "trans rights", "homophobia"],
    },
    "Religious Freedom & Persecution": {
        "Religious Discrimination": ["persecution", "intolerance", "sectarianism", "faith-based violence"],
        "Religious Minorities' Rights": ["minorities", "blasphemy laws", "religious freedom", "forced conversion"],
    },
    "Grassroots Mobilization": {
        "Community Activism": ["activism", "grassroots", "volunteering", "local organizing"],
        "Protests & Demonstrations": ["march", "strike", "rally", "sit-in", "boycott"],
        "Coalition Building": ["solidarity", "collaboration", "alliances", "mutual aid"],
    },
    "Environmental Crisis & Activism": {
        "Climate Change Awareness": ["climate crisis", "global warming", "carbon emissions", "fossil fuels"],
        "Conservation & Sustainability": ["deforestation", "wildlife protection", "biodiversity"],
        "Environmental Justice": ["pollution", "water crisis", "land rights", "indigenous rights"],
    },
    "Anti-Extremism & Anti-Violence": {
        "Hate Speech & Radicalization": ["hate speech", "extremism", "online radicalization", "propaganda"],
        "Mob & Sectarian Violence": ["mob attack", "lynching", "sectarian violence", "hate crimes"],
        "Counterterrorism & De-Radicalization": ["terrorism", "prevention", "peacebuilding", "rehabilitation"],
    },
    "Social Inequality & Economic Disparities": {
        "Class Privilege & Labor Rights": ["classism", "labor rights", "unions", "wage gap"],
        "Poverty & Economic Justice": ["poverty", "inequality", "economic disparity", "wealth gap"],
        "Housing & Healthcare": ["housing crisis", "healthcare access", "social safety nets"],
    },
    "Activism & Advocacy": {
        "Policy Advocacy & Legal Reforms": ["campaign", "policy change", "legal advocacy"],
        "Social Media Activism": ["hashtags", "digital activism", "awareness campaign"],
        "Freedom of Expression & Press": ["press freedom", "censorship", "media rights"],
    },
    "Systemic Oppression": {
        "Marginalized Communities": ["minorities", "exclusion", "systemic discrimination"],
        "Racial & Ethnic Discrimination": ["racism", "xenophobia", "ethnic cleansing", "casteism"],
        "Institutional Bias": ["institutional racism", "structural oppression", "biased laws"],
    },
    "Intersectionality": {
        "Multiple Oppressions": ["overlapping struggles", "intersecting identities", "double discrimination"],
        "Women & Marginalized Identities": ["feminism", "queer feminism", "minority women"],
        "Global Solidarity Movements": ["transnational activism", "cross-movement solidarity"],
    },
    "Call to Action": {
        "Petitions & Direct Action": ["sign petition", "protest", "boycott"],
        "Fundraising & Support": ["donate", "crowdfunding", "aid support"],
        "Policy & Legislative Action": ["policy change", "demand action", "write to lawmakers"],
    },
    "Empowerment & Resistance": {
        "Grassroots Organizing": ["community empowerment", "leadership training"],
        "Revolutionary Movements": ["resistance", "revolt", "revolutionary change"],
        "Inspiration & Motivational Messaging": ["hope", "courage", "overcoming struggles"],
    },
    "Climate Justice": {
        "Indigenous Environmental Activism": ["land rights", "indigenous climate leadership"],
        "Corporate Accountability": ["big oil", "corporate greed", "environmental negligence"],
        "Sustainable Development": ["eco-friendly", "renewable energy", "circular economy"],
    },
    "Human Rights Advocacy": {
        "Criminal Justice Reform": ["police brutality", "wrongful convictions", "prison reform"],
        "Workplace Discrimination & Labor Rights": ["workplace bias", "equal pay", "unions"],
        # Lowercased ("un declarations") so it can match the lowercased caption text.
        "International Human Rights": ["humanitarian law", "un declarations", "international treaties"],
    },
}
128
 
129
+
130
  # Detect language
131
  def detect_language(text):
132
  try:
 
158
def extract_hashtags(text):
    """Return every hashtag found in *text*, in order of appearance.

    A hashtag is a ``#`` followed by one or more word characters
    (letters, digits, underscore). Duplicates are kept.
    """
    return re.findall(r"#\w+", text)
160
 
 
 
 
 
161
  # Categorize frames into Major, Significant, and Minor based on frequency
162
  def categorize_frames(frame_list):
163
  frame_counter = Counter(frame_list)
 
166
  sorted_frames = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
167
 
168
  for i, (frame, count) in enumerate(sorted_frames):
169
+ if i == 0: # Highest frequency frame
170
  categorized_frames["Major Focus"].append(frame)
171
+ elif i < 3: # Top 3 most mentioned frames
172
  categorized_frames["Significant Focus"].append(frame)
173
  else:
174
  categorized_frames["Minor Mention"].append(frame)
 
176
  return categorized_frames
177
 
178
  # Extract frames using keyword matching and categorize
179
def extract_frames_fallback(text, frame_categories=None):
    """Detect activism frames in *text* by keyword matching and rank them.

    Args:
        text: Caption or message to scan. ``None`` is treated as empty.
        frame_categories: Mapping of main category ->
            ``{subcategory: [keywords]}``. When omitted, the module-level
            ``frame_categories`` table is used, so one-argument callers work.

    Returns:
        Whatever ``categorize_frames`` returns for the detected
        ``(main_category, subcategory)`` tuples. Each tuple is repeated once
        per matching keyword so that the frequency-based ranking done by
        ``categorize_frames`` reflects how strongly a frame matched instead
        of every frame having a count of one.
    """
    if frame_categories is None:
        # The parameter shadows the module-level table, so fetch it explicitly.
        frame_categories = globals()["frame_categories"]

    text_lower = (text or "").lower()
    detected_frames = []

    for main_category, subcategories in frame_categories.items():
        for subcategory, keywords in subcategories.items():
            # Compare lowercased keywords: the text is lowercased, so a
            # keyword containing capitals could otherwise never match.
            keyword_count = sum(1 for word in keywords if word.lower() in text_lower)
            detected_frames.extend([(main_category, subcategory)] * keyword_count)

    # NOTE(review): frames are tuples; consumers that ", ".join() them must
    # format each tuple to a string first.
    return categorize_frames(detected_frames)
194
 
 
 
 
 
 
 
 
 
 
 
195
  # Extract captions from DOCX
196
  def extract_captions_from_docx(docx_file):
197
  doc = Document(docx_file)
 
206
  captions[current_post].append(text)
207
  return {post: " ".join(lines) for post, lines in captions.items() if lines}
208
 
209
+ # Extract metadata from Excel file
210
def extract_metadata_from_excel(excel_file):
    """Read post metadata from an Excel workbook.

    Args:
        excel_file: Path or file-like object accepted by ``pandas.read_excel``.

    Returns:
        A list with one dict per spreadsheet row, keyed by column name.
        Empty cells (NaN/NaT) are omitted from each dict so that callers'
        ``dict.get(key, default)`` fallbacks apply instead of showing "nan".
        Returns an empty list if the file cannot be read; the error is logged.
    """
    try:
        df = pd.read_excel(excel_file)
        return [
            {column: value for column, value in record.items() if not pd.isna(value)}
            for record in df.to_dict(orient="records")
        ]
    except Exception as e:
        # Best-effort by design: a bad upload must not crash the app.
        # logging.exception keeps the traceback for diagnosis.
        logging.exception("Error processing Excel file: %s", e)
        return []
 
 
 
 
 
 
218
 
219
+ # Merge metadata with generated analysis
220
def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """Merge spreadsheet metadata rows into the per-post analysis results.

    Args:
        generated_data: Dict keyed by post label (e.g. ``"Post 1"``) holding
            the generated analysis for each post. Updated in place.
        excel_metadata: Iterable of row dicts; a row's ``"Post Number"``
            selects the post it belongs to.

    Returns:
        ``generated_data`` with each row merged into its ``"Post <n>"`` entry,
        or added as a new entry when no analysis exists for that post.
    """
    for row_index, post_data in enumerate(excel_metadata, start=1):
        raw_number = post_data.get("Post Number")

        if raw_number is None or (isinstance(raw_number, float) and raw_number != raw_number):
            # Missing or NaN (empty cell): fall back to the row's position,
            # i.e. the first metadata row describes "Post 1".
            raw_number = row_index
        elif isinstance(raw_number, float) and raw_number.is_integer():
            # Numeric columns containing blanks are read as floats; 1.0 must
            # map to "Post 1", not "Post 1.0".
            raw_number = int(raw_number)

        post_number = f"Post {raw_number}"
        if post_number in generated_data:
            generated_data[post_number].update(post_data)
        else:
            # Copy so later updates do not mutate the caller's metadata row.
            generated_data[post_number] = dict(post_data)
    return generated_data
228
 
229
+ # Create DOCX file matching the uploaded format
230
  def create_docx_from_data(extracted_data):
231
  doc = Document()
232
 
233
+ for post_number, data in extracted_data.items():
234
+ doc.add_heading(post_number, level=1)
235
 
236
+ ordered_keys = [
237
+ "Post Number", "Date of Post", "Media Type", "Number of Pictures",
238
+ "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
239
+ "Full Caption", "Language", "Tone", "Hashtags", "Frames"
240
  ]
241
 
242
+ for key in ordered_keys:
243
+ value = data.get(key, "N/A")
 
244
 
245
+ if key in ["Tone", "Hashtags"]:
246
+ value = ", ".join(value) if isinstance(value, list) else value
247
+ elif key == "Frames" and isinstance(value, dict):
248
+ frame_text = "\n".join([f" {category}: {', '.join(frames)}" for category, frames in value.items() if frames])
249
+ value = f"\n{frame_text}" if frame_text else "N/A"
250
 
251
+ doc.add_paragraph(f"**{key}:** {value}")
 
 
 
 
 
 
 
 
252
 
253
  doc.add_paragraph("\n")
254
 
 
257
  # Streamlit app
258
  st.title("AI-Powered Activism Message Analyzer")
259
 
260
st.write("Enter text or upload a DOCX/Excel file for analysis:")

input_text = st.text_area("Input Text", height=200)
uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])


def _analyze_text(text):
    """Build the analysis record (caption, language, tone, hashtags, frames) for one text."""
    return {
        "Full Caption": text,
        "Language": detect_language(text),
        "Tone": extract_tone(text),
        "Hashtags": extract_hashtags(text),
        # Pass the keyword table explicitly: extract_frames_fallback takes it
        # as its second argument.
        "Frames": extract_frames_fallback(text, frame_categories),
    }


# Maps a post label ("Manual Input", "Post 1", ...) to its analysis record.
output_data = {}

if input_text:
    output_data["Manual Input"] = _analyze_text(input_text)

if uploaded_docx:
    for post_label, caption_text in extract_captions_from_docx(uploaded_docx).items():
        output_data[post_label] = _analyze_text(caption_text)

if uploaded_excel:
    excel_metadata = extract_metadata_from_excel(uploaded_excel)
    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

if output_data:
    # Display results in collapsible sections for better UI
    for post_number, data in output_data.items():
        with st.expander(post_number):
            for key, value in data.items():
                st.write(f"**{key}:** {value}")

    # Offer the same results as a downloadable DOCX report.
    docx_output = create_docx_from_data(output_data)
    docx_io = io.BytesIO()
    docx_output.save(docx_io)
    docx_io.seek(0)
    st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="merged_analysis.docx")
305
+