ahm14 committed on
Commit 2bc09f4 · verified · 1 Parent(s): f4228a1

Update app.py

Files changed (1):
  1. app.py +73 -119
app.py CHANGED
@@ -2,9 +2,7 @@ import pandas as pd
 import streamlit as st
 import re
 import logging
-import nltk
 from docx import Document
-import io
 from langdetect import detect
 from transformers import pipeline
 from dotenv import load_dotenv
@@ -24,21 +22,7 @@ llm = ChatGroq(temperature=0.5, groq_api_key="GROQ_API_KEY", model_name="llama3-
 # Download required NLTK resources
 nltk.download("punkt")
 
-# Tone categories for fallback method
-tone_categories = {
-    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
-    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
-    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief", "devastation"],
-    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
-    "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
-    "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
-    "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
-    "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
-    "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
-    "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
-}
-
-# Frame categories for fallback method
+# Frame categories for fallback method (with Major, Significant, Minor focus)
 frame_categories = {
     "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
     "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
@@ -84,10 +68,6 @@ def extract_tone_fallback(text):
             detected_tones.add(category)
     return list(detected_tones) if detected_tones else ["Neutral"]
 
-# Extract hashtags
-def extract_hashtags(text):
-    return re.findall(r"#\w+", text)
-
 # Extract frames using Groq API (or fallback)
 def extract_frames(text):
     try:
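Note: extract_hashtags is removed here, although the report builder added later in this commit still reads data.get('Hashtags', []). For reference, the deleted helper runs standalone:

    import re

    def extract_hashtags(text):
        return re.findall(r"#\w+", text)

    print(extract_hashtags("March today #JusticeNow #Solidarity"))
    # ['#JusticeNow', '#Solidarity']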
@@ -98,27 +78,14 @@ def extract_frames(text):
         logging.error(f"Groq API error: {e}")
         return extract_frames_fallback(text)
 
-# Fallback method for frame extraction (with categorization of Major, Significant, Minor)
+# Fallback method for frame extraction
 def extract_frames_fallback(text):
     detected_frames = set()
-    frame_focus = {"Major Focus": [], "Significant Focus": [], "Minor Mention": []}
     text_lower = text.lower()
-
     for category, keywords in frame_categories.items():
-        keyword_count = sum(word in text_lower for word in keywords)
-        if keyword_count > 3:
-            frame_focus["Major Focus"].append(category)
-        elif keyword_count > 1:
-            frame_focus["Significant Focus"].append(category)
-        elif keyword_count > 0:
-            frame_focus["Minor Mention"].append(category)
-
-    # Return categorized frames
-    for focus, categories in frame_focus.items():
-        for category in categories:
-            detected_frames.add(f"{focus}: {category}")
-
-    return list(detected_frames)
+        if any(word in text_lower for word in keywords):
+            detected_frames.add(f"{category}: Major Focus")
+    return list(detected_frames) if detected_frames else ["No Focus"]
 
 # Extract captions from DOCX
 def extract_captions_from_docx(docx_file):
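Note: the rewritten fallback flags a category as soon as any keyword appears and labels every hit "Major Focus", where the removed version graded focus by keyword count. A self-contained sketch of the new behaviour (the trimmed category dictionary and sample sentence are illustrative only):

    frame_categories = {
        "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
        "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
    }

    def extract_frames_fallback(text):
        detected_frames = set()
        text_lower = text.lower()
        for category, keywords in frame_categories.items():
            if any(word in text_lower for word in keywords):  # a single hit suffices
                detected_frames.add(f"{category}: Major Focus")
        return list(detected_frames) if detected_frames else ["No Focus"]

    print(extract_frames_fallback("The new policy denies basic rights"))
    # ['Political & State Accountability: Major Focus', 'Human Rights & Justice: Major Focus'] (order varies)

Since the keywords are checked with substring membership, "law" also fires inside words such as "flawed"; word-boundary matching would need a regex instead.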
@@ -134,48 +101,64 @@ def extract_captions_from_docx(docx_file):
             captions[current_post].append(text)
     return {post: " ".join(lines) for post, lines in captions.items() if lines}
 
-# Function to extract metadata from an Excel file
+# Extract metadata from Excel file
 def extract_metadata_from_excel(excel_file):
-    df = pd.read_excel(excel_file)
-    extracted_data = []
-
-    for index, row in df.iterrows():
-        post_data = {
-            "Post Number": f"Post {index + 1}",
-            "Date of Post": row.get("Date", "N/A"),
-            "Media Type": row.get("Media Type", "N/A"),
-            "Number of Pictures": row.get("Number of Pictures", 0),
-            "Number of Videos": row.get("Number of Videos", 0),
-            "Number of Audios": row.get("Number of Audios", 0),
-            "Likes": row.get("Likes", 0),
-            "Comments": row.get("Comments", 0),
-            "Tagged Audience": row.get("Tagged Audience", "No"),
-        }
-        extracted_data.append(post_data)
-
-    return extracted_data
+    try:
+        df = pd.read_excel(excel_file)
+        metadata = df.set_index("Post Number").to_dict(orient="index")
+        return metadata
+    except Exception as e:
+        logging.error(f"Error reading Excel file: {e}")
+        return {}
 
 # Merge metadata from Excel with the generated data
 def merge_metadata_with_generated_data(generated_data, excel_metadata):
-    for post_data in excel_metadata:
-        post_number = post_data["Post Number"]
-        if post_number in generated_data:
-            generated_data[post_number].update(post_data)
-        else:
-            generated_data[post_number] = post_data
+    for post, metadata in excel_metadata.items():
+        if post in generated_data:
+            generated_data[post].update(metadata)
     return generated_data
 
-# Function to create DOCX from extracted data
-def create_docx_from_data(extracted_data):
+# Function to create the final DOCX with structured output (without tables)
+def create_structured_output_without_table(merged_data, output_path):
     doc = Document()
-    for post_number, data in extracted_data.items():
-        doc.add_heading(post_number, level=1)
-        for key, value in data.items():
-            doc.add_paragraph(f"{key}: {value}")
-        doc.add_paragraph("\n")  # Add a line break between posts
-    return doc
-
-# Streamlit app
+    doc.add_heading('Extracted Social Media Data', 0)
+
+    # Loop through each post and add its structured data
+    for sr_no, (post, data) in enumerate(merged_data.items(), 1):
+        doc.add_heading(f'Post {sr_no}', level=1)
+
+        # Adding the details for each post
+        doc.add_paragraph(f"Date of Post: {data.get('Date of Post', 'N/A')}")
+        doc.add_paragraph(f"Media Type: {data.get('Media Type', 'N/A')}")
+        doc.add_paragraph(f"No of Pictures: {data.get('No of Pictures', 0)}")
+        doc.add_paragraph(f"No of Videos: {data.get('No of Videos', 0)}")
+        doc.add_paragraph(f"No of Audios: {data.get('No of Audios', 0)}")
+        doc.add_paragraph(f"Likes: {data.get('Likes', 'N/A')}")
+        doc.add_paragraph(f"Comments: {data.get('Comments', 'N/A')}")
+        doc.add_paragraph(f"Tagged Audience: {data.get('Tagged Audience', 'No')}")
+        doc.add_paragraph(f"Caption: {data.get('Full Caption', 'N/A')}")
+        doc.add_paragraph(f"Language of Caption: {data.get('Language', 'N/A')}")
+        doc.add_paragraph(f"Total No of Hashtags: {len(data.get('Hashtags', []))}")
+
+        if data.get('Hashtags'):
+            doc.add_paragraph(f"Hashtags: {', '.join(data['Hashtags'])}")
+        else:
+            doc.add_paragraph("Hashtags: N/A")
+
+        # Adding Frames for each post
+        doc.add_heading("Frames", level=2)
+        if data.get("Frames"):
+            for frame in data['Frames']:
+                doc.add_paragraph(f"- {frame}")
+        else:
+            doc.add_paragraph("No Frames available")
+
+        doc.add_paragraph("\n")  # Add a space between posts
+
+    # Save the document
+    doc.save(output_path)
+
+# Streamlit app setup
 st.title("AI-Powered Activism Message Analyzer")
 
 st.write("Enter text or upload a DOCX/Excel file for analysis:")
@@ -192,53 +175,24 @@ uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
 # Initialize output dictionary
 output_data = {}
 
-# Process Text Input
-if input_text:
-    output_data["Manual Input"] = {
-        "Full Caption": input_text,
-        "Language": detect_language(input_text),
-        "Tone": extract_tone(input_text),
-        "Hashtags": extract_hashtags(input_text),
-        "Frames": extract_frames(input_text),
-    }
-    st.success("Analysis completed for text input.")
-
-# Process DOCX file
+# Extract and process data based on file uploads or input text
 if uploaded_docx:
-    captions = extract_captions_from_docx(uploaded_docx)
-    for caption, text in captions.items():
-        output_data[caption] = {
-            "Full Caption": text,
-            "Language": detect_language(text),
-            "Tone": extract_tone(text),
-            "Hashtags": extract_hashtags(text),
-            "Frames": extract_frames(text),
-        }
-    st.success(f"Analysis completed for {len(captions)} posts from DOCX.")
-
-# Process Excel file
+    output_data = extract_captions_from_docx(uploaded_docx)
 if uploaded_excel:
-    excel_metadata = extract_metadata_from_excel(uploaded_excel)
-    st.success(f"Excel metadata extracted with {len(excel_metadata)} posts.")
-
-# Merge and display final data
-if uploaded_excel:
-    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)
-
-# Display results
-if output_data:
-    st.write(output_data)
+    metadata = extract_metadata_from_excel(uploaded_excel)
+    output_data = merge_metadata_with_generated_data(output_data, metadata)
 
-# Create DOCX file for download
+# Generate output
 if output_data:
-    doc = create_docx_from_data(output_data)
-    docx_io = io.BytesIO()
-    doc.save(docx_io)
-    docx_io.seek(0)
-
-    st.download_button(
-        label="Download Extracted Data as DOCX",
-        data=docx_io,
-        file_name="extracted_data.docx",
-        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-    )
+    # Process each post to extract frames
+    for post, data in output_data.items():
+        # Extract frames using Groq API or fallback method
+        frames = extract_frames(data)
+        data['Frames'] = frames
+
+    # Call the function to generate the DOCX report
+    create_structured_output_without_table(output_data, "final_output.docx")
+    st.write("The DOCX file has been created and saved!")
+    st.download_button("Download DOCX", data=open("final_output.docx", "rb"), file_name="final_output.docx")
+
+# Further refinement can be added for additional features as necessary
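Note: the new download path saves final_output.docx to disk and passes st.download_button an open file object that is never closed. The removed code streamed the document from memory instead; a sketch of that pattern, assuming doc is the python-docx Document built above:

    import io

    docx_io = io.BytesIO()
    doc.save(docx_io)   # python-docx can serialize to any file-like object
    docx_io.seek(0)

    st.download_button(
        label="Download DOCX",
        data=docx_io,
        file_name="final_output.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )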