ahm14 committed on
Commit
5893c88
·
verified ·
1 Parent(s): e38265e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -55
app.py CHANGED
@@ -90,6 +90,10 @@ def extract_tone_fallback(text):
90
  def extract_hashtags(text):
91
  return re.findall(r"#\w+", text)
92
 
 
 
 
 
93
  # Categorize frames into Major, Significant, and Minor based on frequency
94
  def categorize_frames(frame_list):
95
  frame_counter = Counter(frame_list)
@@ -98,9 +102,9 @@ def categorize_frames(frame_list):
98
  sorted_frames = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
99
 
100
  for i, (frame, count) in enumerate(sorted_frames):
101
- if i == 0:
102
  categorized_frames["Major Focus"].append(frame)
103
- elif i < 3:
104
  categorized_frames["Significant Focus"].append(frame)
105
  else:
106
  categorized_frames["Minor Mention"].append(frame)
@@ -119,6 +123,16 @@ def extract_frames_fallback(text):
119
 
120
  return categorize_frames(detected_frames)
121
 
 
 
 
 
 
 
 
 
 
 
122
  # Extract captions from DOCX
123
  def extract_captions_from_docx(docx_file):
124
  doc = Document(docx_file)
@@ -133,17 +147,25 @@ def extract_captions_from_docx(docx_file):
133
  captions[current_post].append(text)
134
  return {post: " ".join(lines) for post, lines in captions.items() if lines}
135
 
136
- # Extract metadata from Excel file
137
- def extract_metadata_from_excel(excel_file):
138
- try:
139
- df = pd.read_excel(excel_file)
140
- extracted_data = df.to_dict(orient="records")
141
- return extracted_data
142
- except Exception as e:
143
- logging.error(f"Error processing Excel file: {e}")
144
- return []
 
 
 
 
145
 
146
- # Create DOCX file in the required format
 
 
 
 
147
  def create_docx_from_data(extracted_data):
148
  doc = Document()
149
 
@@ -159,21 +181,14 @@ def create_docx_from_data(extracted_data):
159
  value = data.get(field, "N/A")
160
  doc.add_paragraph(f"**{field}:** {value}")
161
 
162
- caption_text = data.get("Full Caption", "N/A")
163
- doc.add_paragraph(f"**Caption:** {caption_text}")
164
-
165
- language = data.get("Language", "N/A")
166
- doc.add_paragraph(f"**Language:** {language}")
167
-
168
- tone = ", ".join(data.get("Tone", ["N/A"]))
169
- doc.add_paragraph(f"**Tone:** {tone}")
170
 
171
- hashtags = ", ".join(data.get("Hashtags", []))
172
- doc.add_paragraph(f"**Hashtags:** {hashtags}")
 
173
 
174
  frames = data.get("Frames", {})
175
  doc.add_paragraph("**Frames:**")
176
-
177
  for category, frame_list in frames.items():
178
  if frame_list:
179
  doc.add_paragraph(f" {category}: {', '.join(frame_list)}")
@@ -185,41 +200,16 @@ def create_docx_from_data(extracted_data):
185
  # Streamlit app
186
  st.title("AI-Powered Activism Message Analyzer")
187
 
188
- st.write("Enter text or upload a DOCX/Excel file for analysis:")
189
-
190
- input_text = st.text_area("Input Text", height=200)
191
  uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
192
  uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
193
 
194
- output_data = []
195
-
196
- if uploaded_excel:
197
- output_data = extract_metadata_from_excel(uploaded_excel)
198
-
199
- if input_text:
200
- text_analysis = {
201
- "Full Caption": input_text,
202
- "Language": detect_language(input_text),
203
- "Tone": extract_tone(input_text),
204
- "Hashtags": extract_hashtags(input_text),
205
- "Frames": extract_frames_fallback(input_text),
206
- }
207
- output_data.append(text_analysis)
208
-
209
- if uploaded_docx:
210
- captions = extract_captions_from_docx(uploaded_docx)
211
- for caption, text in captions.items():
212
- text_analysis = {
213
- "Full Caption": text,
214
- "Language": detect_language(text),
215
- "Tone": extract_tone(text),
216
- "Hashtags": extract_hashtags(text),
217
- "Frames": extract_frames_fallback(text),
218
- }
219
- output_data.append(text_analysis)
220
-
221
- if output_data:
222
- docx_output = create_docx_from_data(output_data)
223
  docx_io = io.BytesIO()
224
  docx_output.save(docx_io)
225
  docx_io.seek(0)
 
90
  def extract_hashtags(text):
91
  return re.findall(r"#\w+", text)
92
 
93
+ # Extract hashtags
94
+ def extract_hashtags(text):
95
+ return re.findall(r"#\w+", text)
96
+
97
  # Categorize frames into Major, Significant, and Minor based on frequency
98
  def categorize_frames(frame_list):
99
  frame_counter = Counter(frame_list)
 
102
  sorted_frames = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
103
 
104
  for i, (frame, count) in enumerate(sorted_frames):
105
+ if i == 0:
106
  categorized_frames["Major Focus"].append(frame)
107
+ elif i < 3:
108
  categorized_frames["Significant Focus"].append(frame)
109
  else:
110
  categorized_frames["Minor Mention"].append(frame)
 
123
 
124
  return categorize_frames(detected_frames)
125
 
126
+ # Extract metadata from Excel file
127
+ def extract_metadata_from_excel(excel_file):
128
+ try:
129
+ df = pd.read_excel(excel_file)
130
+ extracted_data = df.to_dict(orient="records")
131
+ return extracted_data
132
+ except Exception as e:
133
+ logging.error(f"Error processing Excel file: {e}")
134
+ return []
135
+
136
  # Extract captions from DOCX
137
  def extract_captions_from_docx(docx_file):
138
  doc = Document(docx_file)
 
147
  captions[current_post].append(text)
148
  return {post: " ".join(lines) for post, lines in captions.items() if lines}
149
 
150
+ # Merge metadata and captions together
151
+ def merge_metadata_with_captions(metadata, captions):
152
+ merged_data = []
153
+ for i, meta in enumerate(metadata):
154
+ post_number = f"Post {i+1}"
155
+ caption_text = captions.get(post_number, "No caption available")
156
+
157
+ post_data = meta.copy()
158
+ post_data["Full Caption"] = caption_text
159
+ post_data["Language"] = detect_language(caption_text)
160
+ post_data["Tone"] = extract_tone(caption_text)
161
+ post_data["Hashtags"] = extract_hashtags(caption_text)
162
+ post_data["Frames"] = extract_frames_fallback(caption_text)
163
 
164
+ merged_data.append(post_data)
165
+
166
+ return merged_data
167
+
168
+ # Create DOCX file with correct formatting
169
  def create_docx_from_data(extracted_data):
170
  doc = Document()
171
 
 
181
  value = data.get(field, "N/A")
182
  doc.add_paragraph(f"**{field}:** {value}")
183
 
184
+ doc.add_paragraph(f"**Caption:** {data.get('Full Caption', 'N/A')}")
 
 
 
 
 
 
 
185
 
186
+ doc.add_paragraph(f"**Language:** {data.get('Language', 'N/A')}")
187
+ doc.add_paragraph(f"**Tone:** {', '.join(data.get('Tone', ['N/A']))}")
188
+ doc.add_paragraph(f"**Hashtags:** {', '.join(data.get('Hashtags', []))}")
189
 
190
  frames = data.get("Frames", {})
191
  doc.add_paragraph("**Frames:**")
 
192
  for category, frame_list in frames.items():
193
  if frame_list:
194
  doc.add_paragraph(f" {category}: {', '.join(frame_list)}")
 
200
  # Streamlit app
201
  st.title("AI-Powered Activism Message Analyzer")
202
 
 
 
 
203
  uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
204
  uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
205
 
206
+ if uploaded_excel and uploaded_docx:
207
+ excel_metadata = extract_metadata_from_excel(uploaded_excel)
208
+ docx_captions = extract_captions_from_docx(uploaded_docx)
209
+
210
+ merged_data = merge_metadata_with_captions(excel_metadata, docx_captions)
211
+ docx_output = create_docx_from_data(merged_data)
212
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  docx_io = io.BytesIO()
214
  docx_output.save(docx_io)
215
  docx_io.seek(0)