Update app.py
app.py CHANGED
@@ -2,9 +2,7 @@ import pandas as pd
 import streamlit as st
 import re
 import logging
-import nltk
 from docx import Document
-import io
 from langdetect import detect
 from transformers import pipeline
 from dotenv import load_dotenv
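Note: this hunk drops `import nltk`, but `nltk.download("punkt")` survives at line 23 of the new file (next hunk), which would raise `NameError` at startup. A minimal sketch of the import header the new file would need, assuming the `nltk` import is restored (the grouping is illustrative, not part of the commit):

import logging
import re

import nltk  # assumed fix: nltk.download("punkt") below still needs this
import pandas as pd
import streamlit as st
from docx import Document
from dotenv import load_dotenv
from langdetect import detect
from transformers import pipeline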
@@ -24,21 +22,7 @@ llm = ChatGroq(temperature=0.5, groq_api_key="GROQ_API_KEY", model_name="llama3-
 # Download required NLTK resources
 nltk.download("punkt")
 
-#
-tone_categories = {
-    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
-    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
-    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief", "devastation"],
-    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
-    "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
-    "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
-    "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
-    "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
-    "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
-    "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
-}
-
-# Frame categories for fallback method
+# Frame categories for fallback method (with Major, Significant, Minor focus)
 frame_categories = {
     "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
     "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
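Note: the fallback matches categories by plain substring membership, so a single keyword anywhere in the caption is a hit (including inside longer words, e.g. "law" inside "flawed"). A minimal sketch of how `frame_categories` is consumed, using only the two categories shown in this hunk and an invented sample caption:

frame_categories = {
    "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
    "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
}

caption = "New government policy raises humanitarian concerns".lower()
# A category matches as soon as any of its keywords occurs as a substring.
matches = [cat for cat, kws in frame_categories.items() if any(kw in caption for kw in kws)]
print(matches)  # ['Human Rights & Justice', 'Political & State Accountability']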
@@ -84,10 +68,6 @@ def extract_tone_fallback(text):
             detected_tones.add(category)
     return list(detected_tones) if detected_tones else ["Neutral"]
 
-# Extract hashtags
-def extract_hashtags(text):
-    return re.findall(r"#\w+", text)
-
 # Extract frames using Groq API (or fallback)
 def extract_frames(text):
     try:
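Note: `extract_hashtags` is removed here, yet the DOCX writer added later in this commit still reads a `Hashtags` field (new lines 141-146), which nothing populates any more. The deleted helper, runnable standalone for reference:

import re

def extract_hashtags(text):
    # '#' followed by word characters, e.g. '#justice'
    return re.findall(r"#\w+", text)

print(extract_hashtags("March on #Friday for #justice"))  # ['#Friday', '#justice']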
@@ -98,27 +78,14 @@ def extract_frames(text):
         logging.error(f"Groq API error: {e}")
         return extract_frames_fallback(text)
 
-# Fallback method for frame extraction
+# Fallback method for frame extraction
 def extract_frames_fallback(text):
     detected_frames = set()
-    frame_focus = {"Major Focus": [], "Significant Focus": [], "Minor Mention": []}
     text_lower = text.lower()
-
     for category, keywords in frame_categories.items():
-        keyword_count = sum(1 for word in keywords if word in text_lower)
-        if keyword_count > …:
-            frame_focus["Major Focus"].append(category)
-        elif keyword_count > 1:
-            frame_focus["Significant Focus"].append(category)
-        elif keyword_count > 0:
-            frame_focus["Minor Mention"].append(category)
-
-    # Return categorized frames
-    for focus, categories in frame_focus.items():
-        for category in categories:
-            detected_frames.add(f"{focus}: {category}")
-
-    return list(detected_frames)
+        if any(word in text_lower for word in keywords):
+            detected_frames.add(f"{category}: Major Focus")
+    return list(detected_frames) if detected_frames else ["No Focus"]
 
 # Extract captions from DOCX
 def extract_captions_from_docx(docx_file):
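Note: the rewritten fallback collapses the old three-tier scoring into a single test, so one keyword hit now labels a category "Major Focus". A behavior sketch with `frame_categories` trimmed to one entry and invented sample strings:

frame_categories = {"Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"]}

def extract_frames_fallback(text):
    detected_frames = set()
    text_lower = text.lower()
    for category, keywords in frame_categories.items():
        # One substring hit is enough for "Major Focus" in the new version.
        if any(word in text_lower for word in keywords):
            detected_frames.add(f"{category}: Major Focus")
    return list(detected_frames) if detected_frames else ["No Focus"]

print(extract_frames_fallback("Protect human rights"))  # ['Human Rights & Justice: Major Focus']
print(extract_frames_fallback("Weather report"))        # ['No Focus']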
@@ -134,48 +101,64 @@ def extract_captions_from_docx(docx_file):
         captions[current_post].append(text)
     return {post: " ".join(lines) for post, lines in captions.items() if lines}
 
-#
+# Extract metadata from Excel file
 def extract_metadata_from_excel(excel_file):
-    df = pd.read_excel(excel_file)
-    extracted_data = []
-    for _, row in df.iterrows():
-        post_data = {
-            …
-            "Media Type": row.get("Media Type", "N/A"),
-            "Number of Pictures": row.get("Number of Pictures", 0),
-            "Number of Videos": row.get("Number of Videos", 0),
-            "Number of Audios": row.get("Number of Audios", 0),
-            "Likes": row.get("Likes", 0),
-            "Comments": row.get("Comments", 0),
-            "Tagged Audience": row.get("Tagged Audience", "No"),
-        }
-        extracted_data.append(post_data)
-
-    return extracted_data
+    try:
+        df = pd.read_excel(excel_file)
+        metadata = df.set_index("Post Number").to_dict(orient="index")
+        return metadata
+    except Exception as e:
+        logging.error(f"Error reading Excel file: {e}")
+        return {}
 
 # Merge metadata from Excel with the generated data
 def merge_metadata_with_generated_data(generated_data, excel_metadata):
-    for post_data in excel_metadata:
-        post_number = post_data.get("Post Number")
-        if post_number in generated_data:
-            generated_data[post_number].update(post_data)
-        else:
-            generated_data[post_number] = post_data
+    for post, metadata in excel_metadata.items():
+        if post in generated_data:
+            generated_data[post].update(metadata)
     return generated_data
 
-# Function to create DOCX
-def …
+# Function to create the final DOCX with structured output (without tables)
+def create_structured_output_without_table(merged_data, output_path):
     doc = Document()
-    doc.…
+    doc.add_heading('Extracted Social Media Data', 0)
+
+    # Loop through each post and add its structured data
+    for sr_no, (post, data) in enumerate(merged_data.items(), 1):
+        doc.add_heading(f'Post {sr_no}', level=1)
+
+        # Adding the details for each post
+        doc.add_paragraph(f"Date of Post: {data.get('Date of Post', 'N/A')}")
+        doc.add_paragraph(f"Media Type: {data.get('Media Type', 'N/A')}")
+        doc.add_paragraph(f"No of Pictures: {data.get('No of Pictures', 0)}")
+        doc.add_paragraph(f"No of Videos: {data.get('No of Videos', 0)}")
+        doc.add_paragraph(f"No of Audios: {data.get('No of Audios', 0)}")
+        doc.add_paragraph(f"Likes: {data.get('Likes', 'N/A')}")
+        doc.add_paragraph(f"Comments: {data.get('Comments', 'N/A')}")
+        doc.add_paragraph(f"Tagged Audience: {data.get('Tagged Audience', 'No')}")
+        doc.add_paragraph(f"Caption: {data.get('Full Caption', 'N/A')}")
+        doc.add_paragraph(f"Language of Caption: {data.get('Language', 'N/A')}")
+        doc.add_paragraph(f"Total No of Hashtags: {len(data.get('Hashtags', []))}")
+
+        if data.get('Hashtags'):
+            doc.add_paragraph(f"Hashtags: {', '.join(data['Hashtags'])}")
+        else:
+            doc.add_paragraph("Hashtags: N/A")
+
+        # Adding Frames for each post
+        doc.add_heading("Frames", level=2)
+        if data.get("Frames"):
+            for frame in data['Frames']:
+                doc.add_paragraph(f"- {frame}")
+        else:
+            doc.add_paragraph("No Frames available")
+
+        doc.add_paragraph("\n")  # Add a space between posts
+
+    # Save the document
+    doc.save(output_path)
+
+# Streamlit app setup
 st.title("AI-Powered Activism Message Analyzer")
 
 st.write("Enter text or upload a DOCX/Excel file for analysis:")
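Note: the new Excel reader keys metadata by the "Post Number" column, and the merge only updates posts whose keys already exist in `generated_data`, so those values must match the post keys produced by the DOCX parser (e.g. "Post 1"). A sketch of the returned shape with invented rows; also worth noting that the writer above reads "No of Pictures"-style keys while the removed reader used "Number of Pictures"-style columns, so mismatched spreadsheets silently fall back to the defaults:

import pandas as pd

df = pd.DataFrame({
    "Post Number": ["Post 1", "Post 2"],
    "Media Type": ["Image", "Video"],
    "Likes": [120, 45],
})
# set_index(...).to_dict(orient="index") -> one metadata dict per post,
# keyed by its "Post Number" value.
metadata = df.set_index("Post Number").to_dict(orient="index")
print(metadata["Post 1"])  # {'Media Type': 'Image', 'Likes': 120}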
@@ -192,53 +175,24 @@ uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
 # Initialize output dictionary
 output_data = {}
 
-#
-if input_text:
-    output_data["Manual Input"] = {
-        "Full Caption": input_text,
-        "Language": detect_language(input_text),
-        "Tone": extract_tone(input_text),
-        "Hashtags": extract_hashtags(input_text),
-        "Frames": extract_frames(input_text),
-    }
-    st.success("Analysis completed for text input.")
-
-# Process DOCX file
+# Extract and process data based on file uploads or input text
 if uploaded_docx:
-    captions = extract_captions_from_docx(uploaded_docx)
-    for caption, text in captions.items():
-        output_data[caption] = {
-            "Full Caption": text,
-            "Language": detect_language(text),
-            "Tone": extract_tone(text),
-            "Hashtags": extract_hashtags(text),
-            "Frames": extract_frames(text),
-        }
-    st.success(f"Analysis completed for {len(captions)} posts from DOCX.")
-
-# Process Excel file
+    output_data = extract_captions_from_docx(uploaded_docx)
 if uploaded_excel:
-    excel_metadata = extract_metadata_from_excel(uploaded_excel)
-    …
-# Merge and display final data
-if uploaded_excel:
-    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)
-
-# Display results
-if output_data:
-    st.write(output_data)
+    metadata = extract_metadata_from_excel(uploaded_excel)
+    output_data = merge_metadata_with_generated_data(output_data, metadata)
 
-#
+# Generate output
 if output_data:
-    …
+    # Process each post to extract frames
+    for post, data in output_data.items():
+        # Extract frames using Groq API or fallback method
+        frames = extract_frames(data)
+        data['Frames'] = frames
+
+    # Call the function to generate the DOCX report
+    create_structured_output_without_table(output_data, "final_output.docx")
+    st.write("The DOCX file has been created and saved!")
+    st.download_button("Download DOCX", data=open("final_output.docx", "rb"), file_name="final_output.docx")
+
+# Further refinement can be added for additional features as necessary
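Note: as committed, `extract_captions_from_docx` fills `output_data` with plain caption strings, so `data['Frames'] = frames` and the earlier `generated_data[post].update(metadata)` both assume dicts; and the open file handle passed to `st.download_button` is never closed. A hedged sketch of how the tail could read under those assumptions, reusing the functions defined in app.py (wrapping each caption in a dict is the assumed fix, not part of the commit):

if output_data:
    # Wrap each caption string in the dict shape the DOCX writer reads.
    output_data = {post: {"Full Caption": caption} if isinstance(caption, str) else caption
                   for post, caption in output_data.items()}
    for post, data in output_data.items():
        data["Frames"] = extract_frames(data.get("Full Caption", ""))

    create_structured_output_without_table(output_data, "final_output.docx")
    with open("final_output.docx", "rb") as f:  # close the handle after reading
        st.download_button(
            "Download DOCX",
            data=f.read(),
            file_name="final_output.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )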
|