ahm14 committed
Commit 40be765 · verified · 1 Parent(s): 609d4a9

Update app.py

Files changed (1)
  1. app.py +106 -60
app.py CHANGED
@@ -98,13 +98,26 @@ def extract_frames(text):
         logging.error(f"Groq API error: {e}")
         return extract_frames_fallback(text)
 
-# Fallback method for frame extraction
+# Fallback method for frame extraction (with categorization of Major, Significant, Minor)
 def extract_frames_fallback(text):
     detected_frames = set()
+    frame_focus = {"Major Focus": [], "Significant Focus": [], "Minor Mention": []}
     text_lower = text.lower()
+
     for category, keywords in frame_categories.items():
-        if any(word in text_lower for word in keywords):
-            detected_frames.add(category)
+        keyword_count = sum(word in text_lower for word in keywords)
+        if keyword_count > 3:
+            frame_focus["Major Focus"].append(category)
+        elif keyword_count > 1:
+            frame_focus["Significant Focus"].append(category)
+        elif keyword_count > 0:
+            frame_focus["Minor Mention"].append(category)
+
+    # Return categorized frames
+    for focus, categories in frame_focus.items():
+        for category in categories:
+            detected_frames.add(f"{focus}: {category}")
+
     return list(detected_frames)
 
 # Extract captions from DOCX
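
The new fallback ranks each frame by how many of its keywords occur in the text: more than 3 distinct matches counts as "Major Focus", more than 1 as "Significant Focus", at least 1 as "Minor Mention". A minimal sketch of the thresholds in isolation; the frame_categories contents below are made up for illustration, not the app's real categories:

    # Illustrative stand-in for the app's frame_categories dict.
    frame_categories = {
        "Climate Justice": ["climate", "emissions", "fossil", "warming"],
        "Labor Rights": ["union", "wages"],
    }

    text_lower = "climate strike now: cut emissions, end fossil fuels".lower()

    for category, keywords in frame_categories.items():
        # Counts distinct keywords present, not repeated occurrences.
        keyword_count = sum(word in text_lower for word in keywords)
        print(category, keyword_count)
    # Climate Justice -> 3 matches -> "Significant Focus" (> 1 but not > 3)
    # Labor Rights    -> 0 matches -> omitted from the result

Note that "word in text_lower" is a substring test, so "union" would also match "reunion"; a token-based comparison would be stricter.
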
@@ -121,64 +134,55 @@ def extract_captions_from_docx(docx_file):
             captions[current_post].append(text)
     return {post: " ".join(lines) for post, lines in captions.items() if lines}
 
-# Extract metadata from Excel file
+# Function to extract metadata from an Excel file
 def extract_metadata_from_excel(excel_file):
     try:
         df = pd.read_excel(excel_file)
-        metadata = df.set_index("Post Number").to_dict(orient="index")
-        return metadata
+        # Ensure the required columns are present
+        required_columns = ["Date", "Media Type", "Number of Pictures", "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience"]
+        if not all(col in df.columns for col in required_columns):
+            st.error("Excel file is missing required columns.")
+            return []
+        extracted_data = []
+        for index, row in df.iterrows():
+            post_data = {
+                "Post Number": f"Post {index + 1}",
+                "Date of Post": row.get("Date", "N/A"),
+                "Media Type": row.get("Media Type", "N/A"),
+                "Number of Pictures": row.get("Number of Pictures", 0),
+                "Number of Videos": row.get("Number of Videos", 0),
+                "Number of Audios": row.get("Number of Audios", 0),
+                "Likes": row.get("Likes", 0),
+                "Comments": row.get("Comments", 0),
+                "Tagged Audience": row.get("Tagged Audience", "No"),
+            }
+            extracted_data.append(post_data)
+        return extracted_data
     except Exception as e:
-        logging.error(f"Error reading Excel file: {e}")
-        return {}
+        logging.error(f"Error processing Excel file: {e}")
+        return []
 
 # Merge metadata from Excel with the generated data
 def merge_metadata_with_generated_data(generated_data, excel_metadata):
-    for post, metadata in excel_metadata.items():
-        if post in generated_data:
-            generated_data[post].update(metadata)
+    for post_data in excel_metadata:
+        post_number = post_data["Post Number"]
+        if post_number in generated_data:
+            generated_data[post_number].update(post_data)
+        else:
+            generated_data[post_number] = post_data
     return generated_data
 
-# Function to create the final DOCX with structured output (without tables)
-def create_structured_output_without_table(merged_data, output_path):
+# Function to create DOCX from extracted data
+def create_docx_from_data(extracted_data):
     doc = Document()
-    doc.add_heading('Extracted Social Media Data', 0)
-
-    # Loop through each post and add its structured data
-    for sr_no, (post, data) in enumerate(merged_data.items(), 1):
-        doc.add_heading(f'Post {sr_no}', level=1)
-
-        # Adding the details for each post
-        doc.add_paragraph(f"Date of Post: {data.get('Date of Post', 'N/A')}")
-        doc.add_paragraph(f"Media Type: {data.get('Media Type', 'N/A')}")
-        doc.add_paragraph(f"No of Pictures: {data.get('No of Pictures', 0)}")
-        doc.add_paragraph(f"No of Videos: {data.get('No of Videos', 0)}")
-        doc.add_paragraph(f"No of Audios: {data.get('No of Audios', 0)}")
-        doc.add_paragraph(f"Likes: {data.get('Likes', 'N/A')}")
-        doc.add_paragraph(f"Comments: {data.get('Comments', 'N/A')}")
-        doc.add_paragraph(f"Tagged Audience: {data.get('Tagged Audience', 'No')}")
-        doc.add_paragraph(f"Caption: {data.get('Full Caption', 'N/A')}")
-        doc.add_paragraph(f"Language of Caption: {data.get('Language', 'N/A')}")
-        doc.add_paragraph(f"Total No of Hashtags: {len(data.get('Hashtags', []))}")
-
-        if data.get('Hashtags'):
-            doc.add_paragraph(f"Hashtags: {', '.join(data['Hashtags'])}")
-        else:
-            doc.add_paragraph("Hashtags: N/A")
-
-        # Adding Frames for each post
-        doc.add_heading("Frames", level=2)
-        if data.get("Frames"):
-            for frame in data['Frames']:
-                doc.add_paragraph(f"- {frame}")
-        else:
-            doc.add_paragraph("No Frames available")
+    for post_number, data in extracted_data.items():
+        doc.add_heading(post_number, level=1)
+        for key, value in data.items():
+            doc.add_paragraph(f"{key}: {value}")
+        doc.add_paragraph("\n")  # Add a line break between posts
+    return doc
 
-        doc.add_paragraph("\n")  # Add a space between posts
-
-    # Save the document
-    doc.save(output_path)
-
-# Streamlit app setup
+# Streamlit app
 st.title("AI-Powered Activism Message Analyzer")
 
 st.write("Enter text or upload a DOCX/Excel file for analysis:")
@@ -195,18 +199,60 @@ uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
 # Initialize output dictionary
 output_data = {}
 
-# Extract and process data based on file uploads or input text
+# Process Text Input
+if input_text:
+    output_data["Manual Input"] = {
+        "Full Caption": input_text,
+        "Language": detect_language(input_text),
+        "Tone": extract_tone(input_text),
+        "Hashtags": extract_hashtags(input_text),
+        "Frames": extract_frames(input_text),
+    }
+    st.success("Analysis completed for text input.")
+
+# Process DOCX file
 if uploaded_docx:
-    output_data = extract_captions_from_docx(uploaded_docx)
+    captions = extract_captions_from_docx(uploaded_docx)
+    for caption, text in captions.items():
+        output_data[caption] = {
+            "Full Caption": text,
+            "Language": detect_language(text),
+            "Tone": extract_tone(text),
+            "Hashtags": extract_hashtags(text),
+            "Frames": extract_frames(text),
+        }
+    st.success(f"Analysis completed for {len(captions)} posts from DOCX.")
+
+# Process Excel file
+if uploaded_excel:
+    with st.spinner("Processing Excel file..."):
+        excel_metadata = extract_metadata_from_excel(uploaded_excel)
+        if excel_metadata:
+            st.success(f"Excel metadata extracted with {len(excel_metadata)} posts.")
+        else:
+            st.warning("No valid data extracted from the Excel file.")
+
+# Merge and display final data
 if uploaded_excel:
-    metadata = extract_metadata_from_excel(uploaded_excel)
-    output_data = merge_metadata_with_generated_data(output_data, metadata)
+    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)
+
+# Display results in collapsible sections for better UI
+if output_data:
+    for post_number, data in output_data.items():
+        with st.expander(post_number):
+            for key, value in data.items():
+                st.write(f"**{key}:** {value}")
 
-# Generate output
+# Create DOCX file for download
 if output_data:
-    # Call the function to generate the DOCX report
-    create_structured_output_without_table(output_data, "final_output.docx")
-    st.write("The DOCX file has been created and saved!")
-    st.download_button("Download DOCX", data=open("final_output.docx", "rb"), file_name="final_output.docx")
+    doc = create_docx_from_data(output_data)
+    docx_io = io.BytesIO()
+    doc.save(docx_io)
+    docx_io.seek(0)
 
-# Further refinement can be added for additional features as necessary
+    st.download_button(
+        label="Download Extracted Data as DOCX",
+        data=docx_io,
+        file_name="extracted_data.docx",
+        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    )
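
The download now streams the document from memory instead of writing final_output.docx to disk. The io.BytesIO pattern in isolation (assumes python-docx and Streamlit; the heading and paragraph values are placeholders):

    import io

    from docx import Document

    doc = Document()
    doc.add_heading("Post 1", level=1)  # placeholder content
    doc.add_paragraph("Likes: 120")

    docx_io = io.BytesIO()
    doc.save(docx_io)  # python-docx can save to any writable file-like object
    docx_io.seek(0)    # rewind so the download starts at byte 0
    # docx_io can then be passed to st.download_button(data=docx_io, ...)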