Update app.py
Browse files
app.py
CHANGED
@@ -33,7 +33,7 @@ llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b
|
|
33 |
# Download required NLTK resources
|
34 |
nltk.download("punkt")
|
35 |
|
36 |
-
#
|
37 |
tone_categories = {
|
38 |
"Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
|
39 |
"Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
|
@@ -41,32 +41,92 @@ tone_categories = {
|
|
41 |
"Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
|
42 |
"Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
|
43 |
"Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
|
44 |
-
"Happy": ["joy", "celebration", "cheer", "success", "smile", "gratitude", "harmony"],
|
45 |
"Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
|
46 |
"Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
|
47 |
"Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
|
48 |
"Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
|
49 |
}
|
50 |
|
51 |
-
#
|
|
|
52 |
frame_categories = {
|
53 |
-
"Human Rights & Justice":
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
"
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
"
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
}
|
69 |
|
|
|
70 |
# Detect language
|
71 |
def detect_language(text):
|
72 |
try:
|
@@ -98,10 +158,6 @@ def extract_tone_fallback(text):
|
|
98 |
def extract_hashtags(text):
|
99 |
return re.findall(r"#\w+", text)
|
100 |
|
101 |
-
# Extract hashtags
|
102 |
-
def extract_hashtags(text):
|
103 |
-
return re.findall(r"#\w+", text)
|
104 |
-
|
105 |
# Categorize frames into Major, Significant, and Minor based on frequency
|
106 |
def categorize_frames(frame_list):
|
107 |
frame_counter = Counter(frame_list)
|
@@ -110,9 +166,9 @@ def categorize_frames(frame_list):
|
|
110 |
sorted_frames = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
|
111 |
|
112 |
for i, (frame, count) in enumerate(sorted_frames):
|
113 |
-
if i == 0:
|
114 |
categorized_frames["Major Focus"].append(frame)
|
115 |
-
elif i < 3:
|
116 |
categorized_frames["Significant Focus"].append(frame)
|
117 |
else:
|
118 |
categorized_frames["Minor Mention"].append(frame)
|
@@ -120,27 +176,22 @@ def categorize_frames(frame_list):
|
|
120 |
return categorized_frames
|
121 |
|
122 |
# Extract frames using keyword matching and categorize
|
123 |
-
def extract_frames_fallback(text):
|
124 |
detected_frames = []
|
125 |
text_lower = text.lower()
|
126 |
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
|
|
|
|
|
|
|
|
131 |
|
|
|
132 |
return categorize_frames(detected_frames)
|
133 |
|
134 |
-
# Extract metadata from Excel file
|
135 |
-
def extract_metadata_from_excel(excel_file):
|
136 |
-
try:
|
137 |
-
df = pd.read_excel(excel_file)
|
138 |
-
extracted_data = df.to_dict(orient="records")
|
139 |
-
return extracted_data
|
140 |
-
except Exception as e:
|
141 |
-
logging.error(f"Error processing Excel file: {e}")
|
142 |
-
return []
|
143 |
-
|
144 |
# Extract captions from DOCX
|
145 |
def extract_captions_from_docx(docx_file):
|
146 |
doc = Document(docx_file)
|
@@ -155,51 +206,49 @@ def extract_captions_from_docx(docx_file):
|
|
155 |
captions[current_post].append(text)
|
156 |
return {post: " ".join(lines) for post, lines in captions.items() if lines}
|
157 |
|
158 |
-
#
|
159 |
-
def
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
post_data["Language"] = detect_language(caption_text)
|
168 |
-
post_data["Tone"] = extract_tone(caption_text)
|
169 |
-
post_data["Hashtags"] = extract_hashtags(caption_text)
|
170 |
-
post_data["Frames"] = extract_frames_fallback(caption_text)
|
171 |
-
|
172 |
-
merged_data.append(post_data)
|
173 |
|
174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
|
176 |
-
# Create DOCX file
|
177 |
def create_docx_from_data(extracted_data):
|
178 |
doc = Document()
|
179 |
|
180 |
-
for
|
181 |
-
doc.add_heading(
|
182 |
|
183 |
-
|
184 |
-
"Date of Post", "Media Type", "Number of Pictures",
|
185 |
-
"Number of Audios", "Likes", "Comments", "Tagged Audience"
|
|
|
186 |
]
|
187 |
|
188 |
-
for
|
189 |
-
value = data.get(
|
190 |
-
doc.add_paragraph(f"**{field}:** {value}")
|
191 |
|
192 |
-
|
|
|
|
|
|
|
|
|
193 |
|
194 |
-
|
195 |
-
doc.add_paragraph(f"**Tone:** {', '.join(data.get('Tone', ['N/A']))}")
|
196 |
-
doc.add_paragraph(f"**Hashtags:** {', '.join(data.get('Hashtags', []))}")
|
197 |
-
|
198 |
-
frames = data.get("Frames", {})
|
199 |
-
doc.add_paragraph("**Frames:**")
|
200 |
-
for category, frame_list in frames.items():
|
201 |
-
if frame_list:
|
202 |
-
doc.add_paragraph(f" {category}: {', '.join(frame_list)}")
|
203 |
|
204 |
doc.add_paragraph("\n")
|
205 |
|
@@ -208,17 +257,49 @@ def create_docx_from_data(extracted_data):
|
|
208 |
# Streamlit app
|
209 |
st.title("AI-Powered Activism Message Analyzer")
|
210 |
|
|
|
|
|
|
|
211 |
uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
|
212 |
uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
|
213 |
|
214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
excel_metadata = extract_metadata_from_excel(uploaded_excel)
|
216 |
-
|
217 |
|
218 |
-
|
219 |
-
|
|
|
|
|
|
|
|
|
220 |
|
|
|
|
|
221 |
docx_io = io.BytesIO()
|
222 |
docx_output.save(docx_io)
|
223 |
docx_io.seek(0)
|
224 |
st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="merged_analysis.docx")
|
|
|
|
33 |
# Download required NLTK resources
|
34 |
nltk.download("punkt")
|
35 |
|
36 |
+
# Tone categories for fallback method
|
37 |
tone_categories = {
|
38 |
"Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
|
39 |
"Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
|
|
|
41 |
"Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
|
42 |
"Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
|
43 |
"Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
|
|
|
44 |
"Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
|
45 |
"Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
|
46 |
"Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
|
47 |
"Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
|
48 |
}
|
49 |
|
50 |
+
# Frame categories for fallback method
|
51 |
+
|
52 |
# Two-level keyword table for frame detection:
#   main category -> subcategory -> list of trigger keywords/phrases.
# extract_frames_fallback tests each keyword with a plain substring check
# against the lower-cased text, so:
#   * multi-word phrases match only when they appear verbatim;
#   * mixed-case entries (e.g. "LGBTQ+", "UN declarations") only match if the
#     matcher lower-cases the keyword as well.
# Some keywords (e.g. "censorship", "minorities", "feminism", "unions")
# appear under more than one subcategory, so a single word can trigger
# several frames.
frame_categories = {
    "Human Rights & Justice": {
        "Legal Rights & Reforms": ["law", "justice", "legal", "reforms", "legislation"],
        "Humanitarian Issues": ["humanitarian", "aid", "refugees", "asylum", "crisis response"],
        "Civil Liberties": ["freedom", "expression", "privacy", "rights violations"]
    },
    "Political & State Accountability": {
        "Corruption & Governance": ["corruption", "government", "policy", "accountability", "transparency"],
        "Political Oppression": ["authoritarianism", "censorship", "state control", "dissent", "crackdown"],
        "Elections & Political Representation": ["voting", "elections", "political participation", "democracy"]
    },
    "Gender & Patriarchy": {
        "Gender-Based Violence": ["violence", "domestic abuse", "sexual harassment", "femicide"],
        "Women's Rights & Equality": ["gender equality", "feminism", "reproductive rights", "patriarchy"],
        "LGBTQ+ Rights": ["queer rights", "LGBTQ+", "gender identity", "trans rights", "homophobia"]
    },
    "Religious Freedom & Persecution": {
        "Religious Discrimination": ["persecution", "intolerance", "sectarianism", "faith-based violence"],
        "Religious Minorities' Rights": ["minorities", "blasphemy laws", "religious freedom", "forced conversion"]
    },
    "Grassroots Mobilization": {
        "Community Activism": ["activism", "grassroots", "volunteering", "local organizing"],
        "Protests & Demonstrations": ["march", "strike", "rally", "sit-in", "boycott"],
        "Coalition Building": ["solidarity", "collaboration", "alliances", "mutual aid"]
    },
    "Environmental Crisis & Activism": {
        "Climate Change Awareness": ["climate crisis", "global warming", "carbon emissions", "fossil fuels"],
        "Conservation & Sustainability": ["deforestation", "wildlife protection", "biodiversity"],
        "Environmental Justice": ["pollution", "water crisis", "land rights", "indigenous rights"]
    },
    "Anti-Extremism & Anti-Violence": {
        "Hate Speech & Radicalization": ["hate speech", "extremism", "online radicalization", "propaganda"],
        "Mob & Sectarian Violence": ["mob attack", "lynching", "sectarian violence", "hate crimes"],
        "Counterterrorism & De-Radicalization": ["terrorism", "prevention", "peacebuilding", "rehabilitation"]
    },
    "Social Inequality & Economic Disparities": {
        "Class Privilege & Labor Rights": ["classism", "labor rights", "unions", "wage gap"],
        "Poverty & Economic Justice": ["poverty", "inequality", "economic disparity", "wealth gap"],
        "Housing & Healthcare": ["housing crisis", "healthcare access", "social safety nets"]
    },
    "Activism & Advocacy": {
        "Policy Advocacy & Legal Reforms": ["campaign", "policy change", "legal advocacy"],
        "Social Media Activism": ["hashtags", "digital activism", "awareness campaign"],
        "Freedom of Expression & Press": ["press freedom", "censorship", "media rights"]
    },
    "Systemic Oppression": {
        "Marginalized Communities": ["minorities", "exclusion", "systemic discrimination"],
        "Racial & Ethnic Discrimination": ["racism", "xenophobia", "ethnic cleansing", "casteism"],
        "Institutional Bias": ["institutional racism", "structural oppression", "biased laws"]
    },
    "Intersectionality": {
        "Multiple Oppressions": ["overlapping struggles", "intersecting identities", "double discrimination"],
        "Women & Marginalized Identities": ["feminism", "queer feminism", "minority women"],
        "Global Solidarity Movements": ["transnational activism", "cross-movement solidarity"]
    },
    "Call to Action": {
        "Petitions & Direct Action": ["sign petition", "protest", "boycott"],
        "Fundraising & Support": ["donate", "crowdfunding", "aid support"],
        "Policy & Legislative Action": ["policy change", "demand action", "write to lawmakers"]
    },
    "Empowerment & Resistance": {
        "Grassroots Organizing": ["community empowerment", "leadership training"],
        "Revolutionary Movements": ["resistance", "revolt", "revolutionary change"],
        "Inspiration & Motivational Messaging": ["hope", "courage", "overcoming struggles"]
    },
    "Climate Justice": {
        "Indigenous Environmental Activism": ["land rights", "indigenous climate leadership"],
        "Corporate Accountability": ["big oil", "corporate greed", "environmental negligence"],
        "Sustainable Development": ["eco-friendly", "renewable energy", "circular economy"]
    },
    "Human Rights Advocacy": {
        "Criminal Justice Reform": ["police brutality", "wrongful convictions", "prison reform"],
        "Workplace Discrimination & Labor Rights": ["workplace bias", "equal pay", "unions"],
        "International Human Rights": ["humanitarian law", "UN declarations", "international treaties"]
    }
}
|
128 |
|
129 |
+
|
130 |
# Detect language
|
131 |
def detect_language(text):
|
132 |
try:
|
|
|
158 |
def extract_hashtags(text):
    """Return every hashtag in ``text``, in order of appearance.

    A hashtag is a ``#`` immediately followed by one or more word
    characters. Duplicates are kept; an empty list is returned when the
    text contains none.
    """
    hashtag_pattern = r"#\w+"
    found_tags = re.findall(hashtag_pattern, text)
    return found_tags
|
160 |
|
|
|
|
|
|
|
|
|
161 |
# Categorize frames into Major, Significant, and Minor based on frequency
|
162 |
def categorize_frames(frame_list):
|
163 |
frame_counter = Counter(frame_list)
|
|
|
166 |
sorted_frames = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
|
167 |
|
168 |
for i, (frame, count) in enumerate(sorted_frames):
|
169 |
+
if i == 0: # Highest frequency frame
|
170 |
categorized_frames["Major Focus"].append(frame)
|
171 |
+
elif i < 3: # Top 3 most mentioned frames
|
172 |
categorized_frames["Significant Focus"].append(frame)
|
173 |
else:
|
174 |
categorized_frames["Minor Mention"].append(frame)
|
|
|
176 |
return categorized_frames
|
177 |
|
178 |
# Extract frames using keyword matching and categorize
|
179 |
+
def extract_frames_fallback(text, frame_categories=None):
    """Detect activism frames in ``text`` by keyword matching and rank them.

    Args:
        text: Caption or message to scan.
        frame_categories: Mapping of main category -> subcategory -> list of
            keywords. Optional; when omitted the module-level
            ``frame_categories`` table is used, so single-argument calls
            work.

    Returns:
        The result of ``categorize_frames`` applied to the detected
        ``(main_category, subcategory)`` tuples. Each tuple is listed once
        per matched keyword, so subcategories with more keyword hits rank
        higher in the frequency-based categorization.

    NOTE(review): frames are emitted as tuples, while create_docx_from_data
    builds its "Frames" text with ``', '.join(frames)``, which requires
    strings -- confirm which side should convert before exporting to DOCX.
    """
    if frame_categories is None:
        # The parameter shadows the module-level table of the same name, so
        # the default has to be fetched through the module namespace.
        frame_categories = globals()["frame_categories"]

    detected_frames = []
    text_lower = text.lower()

    # Iterate through the activism topics to match keywords
    for main_category, subcategories in frame_categories.items():
        for subcategory, keywords in subcategories.items():
            # The text is lower-cased, so lower-case the keyword as well;
            # otherwise mixed-case keywords (e.g. "LGBTQ+") can never match.
            keyword_count = sum(1 for word in keywords if word.lower() in text_lower)
            # One entry per matched keyword (none when nothing matched) so the
            # frequency ranking reflects how strongly each frame is present.
            detected_frames.extend([(main_category, subcategory)] * keyword_count)

    # Categorize detected frames based on their frequency
    return categorize_frames(detected_frames)
|
194 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
# Extract captions from DOCX
|
196 |
def extract_captions_from_docx(docx_file):
|
197 |
doc = Document(docx_file)
|
|
|
206 |
captions[current_post].append(text)
|
207 |
return {post: " ".join(lines) for post, lines in captions.items() if lines}
|
208 |
|
209 |
+
# Extract metadata from Excel file
|
210 |
+
def extract_metadata_from_excel(excel_file):
    """Read an Excel sheet and return its rows as a list of dicts.

    Each row becomes one dict keyed by column header. Any failure while
    reading or converting the file is logged and an empty list is returned
    instead of raising.
    """
    try:
        return pd.read_excel(excel_file).to_dict(orient="records")
    except Exception as e:
        # Best-effort: a bad upload must not crash the app.
        logging.error(f"Error processing Excel file: {e}")
    return []
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
+
# Merge metadata with generated analysis
|
220 |
+
def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """Merge Excel metadata rows into the per-post analysis, in place.

    Each row is filed under the key ``"Post <n>"``, where ``<n>`` comes from
    the row's ``"Post Number"`` value. Rows whose key already exists in
    ``generated_data`` update that entry; other rows are added as new
    entries.

    Args:
        generated_data: Dict mapping post labels to analysis dicts. Mutated
            and also returned.
        excel_metadata: Iterable of row dicts (e.g. from
            ``DataFrame.to_dict(orient="records")``).

    Returns:
        The same ``generated_data`` dict, after merging.
    """
    import math  # local import: only needed for the NaN check below

    for post_data in excel_metadata:
        raw_number = post_data.get("Post Number")

        # A blank spreadsheet cell arrives as NaN (or None); treat it like a
        # missing column and fall back to the next sequential number.
        is_blank = raw_number is None or (
            isinstance(raw_number, float) and math.isnan(raw_number)
        )
        if is_blank:
            raw_number = len(generated_data) + 1
        elif isinstance(raw_number, float) and raw_number.is_integer():
            # Integer columns containing blanks are read as floats; without
            # this, 1.0 would yield "Post 1.0" and never match "Post 1".
            raw_number = int(raw_number)

        post_number = f"Post {raw_number}"
        if post_number in generated_data:
            generated_data[post_number].update(post_data)
        else:
            generated_data[post_number] = post_data
    return generated_data
|
228 |
|
229 |
+
# Create DOCX file matching the uploaded format
|
230 |
def create_docx_from_data(extracted_data):
|
231 |
doc = Document()
|
232 |
|
233 |
+
for post_number, data in extracted_data.items():
|
234 |
+
doc.add_heading(post_number, level=1)
|
235 |
|
236 |
+
ordered_keys = [
|
237 |
+
"Post Number", "Date of Post", "Media Type", "Number of Pictures",
|
238 |
+
"Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
|
239 |
+
"Full Caption", "Language", "Tone", "Hashtags", "Frames"
|
240 |
]
|
241 |
|
242 |
+
for key in ordered_keys:
|
243 |
+
value = data.get(key, "N/A")
|
|
|
244 |
|
245 |
+
if key in ["Tone", "Hashtags"]:
|
246 |
+
value = ", ".join(value) if isinstance(value, list) else value
|
247 |
+
elif key == "Frames" and isinstance(value, dict):
|
248 |
+
frame_text = "\n".join([f" {category}: {', '.join(frames)}" for category, frames in value.items() if frames])
|
249 |
+
value = f"\n{frame_text}" if frame_text else "N/A"
|
250 |
|
251 |
+
doc.add_paragraph(f"**{key}:** {value}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
252 |
|
253 |
doc.add_paragraph("\n")
|
254 |
|
|
|
257 |
# Streamlit app
st.title("AI-Powered Activism Message Analyzer")

st.write("Enter text or upload a DOCX/Excel file for analysis:")

input_text = st.text_area("Input Text", height=200)
uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])


def _analyze_caption(caption_text):
    """Build the analysis record (caption, language, tone, hashtags, frames) for one text."""
    return {
        "Full Caption": caption_text,
        "Language": detect_language(caption_text),
        "Tone": extract_tone(caption_text),
        "Hashtags": extract_hashtags(caption_text),
        # extract_frames_fallback takes the keyword table as its second
        # argument; calling it with the text alone raises TypeError.
        "Frames": extract_frames_fallback(caption_text, frame_categories),
    }


# Maps a post label ("Manual Input" or a DOCX post heading) to its analysis.
output_data = {}

if input_text:
    output_data["Manual Input"] = _analyze_caption(input_text)

if uploaded_docx:
    captions = extract_captions_from_docx(uploaded_docx)
    for caption, text in captions.items():
        output_data[caption] = _analyze_caption(text)

if uploaded_excel:
    excel_metadata = extract_metadata_from_excel(uploaded_excel)
    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

if output_data:
    # Display results in collapsible sections for better UI
    for post_number, data in output_data.items():
        with st.expander(post_number):
            for key, value in data.items():
                st.write(f"**{key}:** {value}")

    # Offer the merged analysis as a DOCX built in memory
    docx_output = create_docx_from_data(output_data)
    docx_io = io.BytesIO()
    docx_output.save(docx_io)
    docx_io.seek(0)  # rewind so the download starts at the first byte
    st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="merged_analysis.docx")
|
305 |
+
|