Update app.py
app.py CHANGED
```diff
@@ -90,6 +90,10 @@ def extract_tone_fallback(text):
 def extract_hashtags(text):
     return re.findall(r"#\w+", text)
 
+# Extract hashtags
+def extract_hashtags(text):
+    return re.findall(r"#\w+", text)
+
 # Categorize frames into Major, Significant, and Minor based on frequency
 def categorize_frames(frame_list):
     frame_counter = Counter(frame_list)
@@ -98,9 +102,9 @@ def categorize_frames(frame_list):
     sorted_frames = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
 
     for i, (frame, count) in enumerate(sorted_frames):
-        if i == 0:
+        if i == 0:
             categorized_frames["Major Focus"].append(frame)
-        elif i < 3:
+        elif i < 3:
             categorized_frames["Significant Focus"].append(frame)
         else:
             categorized_frames["Minor Mention"].append(frame)
@@ -119,6 +123,16 @@ def extract_frames_fallback(text):
 
     return categorize_frames(detected_frames)
 
+# Extract metadata from Excel file
+def extract_metadata_from_excel(excel_file):
+    try:
+        df = pd.read_excel(excel_file)
+        extracted_data = df.to_dict(orient="records")
+        return extracted_data
+    except Exception as e:
+        logging.error(f"Error processing Excel file: {e}")
+        return []
+
 # Extract captions from DOCX
 def extract_captions_from_docx(docx_file):
     doc = Document(docx_file)
@@ -133,17 +147,25 @@ def extract_captions_from_docx(docx_file):
             captions[current_post].append(text)
     return {post: " ".join(lines) for post, lines in captions.items() if lines}
 
-#
-def
-
-
-
-
-
-
-
+# Merge metadata and captions together
+def merge_metadata_with_captions(metadata, captions):
+    merged_data = []
+    for i, meta in enumerate(metadata):
+        post_number = f"Post {i+1}"
+        caption_text = captions.get(post_number, "No caption available")
+
+        post_data = meta.copy()
+        post_data["Full Caption"] = caption_text
+        post_data["Language"] = detect_language(caption_text)
+        post_data["Tone"] = extract_tone(caption_text)
+        post_data["Hashtags"] = extract_hashtags(caption_text)
+        post_data["Frames"] = extract_frames_fallback(caption_text)
 
-
+        merged_data.append(post_data)
+
+    return merged_data
+
+# Create DOCX file with correct formatting
 def create_docx_from_data(extracted_data):
     doc = Document()
 
@@ -159,21 +181,14 @@ def create_docx_from_data(extracted_data):
             value = data.get(field, "N/A")
             doc.add_paragraph(f"**{field}:** {value}")
 
-
-        doc.add_paragraph(f"**Caption:** {caption_text}")
-
-        language = data.get("Language", "N/A")
-        doc.add_paragraph(f"**Language:** {language}")
-
-        tone = ", ".join(data.get("Tone", ["N/A"]))
-        doc.add_paragraph(f"**Tone:** {tone}")
+        doc.add_paragraph(f"**Caption:** {data.get('Full Caption', 'N/A')}")
 
-
-        doc.add_paragraph(f"**
+        doc.add_paragraph(f"**Language:** {data.get('Language', 'N/A')}")
+        doc.add_paragraph(f"**Tone:** {', '.join(data.get('Tone', ['N/A']))}")
+        doc.add_paragraph(f"**Hashtags:** {', '.join(data.get('Hashtags', []))}")
 
         frames = data.get("Frames", {})
         doc.add_paragraph("**Frames:**")
-
         for category, frame_list in frames.items():
             if frame_list:
                 doc.add_paragraph(f"  {category}: {', '.join(frame_list)}")
@@ -185,41 +200,16 @@
 # Streamlit app
 st.title("AI-Powered Activism Message Analyzer")
 
-st.write("Enter text or upload a DOCX/Excel file for analysis:")
-
-input_text = st.text_area("Input Text", height=200)
 uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
 uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
 
-
-
-
-
-
-
-
-        "Full Caption": input_text,
-        "Language": detect_language(input_text),
-        "Tone": extract_tone(input_text),
-        "Hashtags": extract_hashtags(input_text),
-        "Frames": extract_frames_fallback(input_text),
-    }
-    output_data.append(text_analysis)
-
-if uploaded_docx:
-    captions = extract_captions_from_docx(uploaded_docx)
-    for caption, text in captions.items():
-        text_analysis = {
-            "Full Caption": text,
-            "Language": detect_language(text),
-            "Tone": extract_tone(text),
-            "Hashtags": extract_hashtags(text),
-            "Frames": extract_frames_fallback(text),
-        }
-        output_data.append(text_analysis)
-
-if output_data:
-    docx_output = create_docx_from_data(output_data)
+if uploaded_excel and uploaded_docx:
+    excel_metadata = extract_metadata_from_excel(uploaded_excel)
+    docx_captions = extract_captions_from_docx(uploaded_docx)
+
+    merged_data = merge_metadata_with_captions(excel_metadata, docx_captions)
+    docx_output = create_docx_from_data(merged_data)
+
     docx_io = io.BytesIO()
     docx_output.save(docx_io)
     docx_io.seek(0)
```
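A few notes on the hunks above. First, the added `# Extract hashtags` block defines `extract_hashtags` immediately below an identical existing definition; the second definition shadows the first, so behavior is unchanged, but one of the two copies is redundant. The regex itself matches `#` followed by word characters:

```python
import re

def extract_hashtags(text):
    # "#" followed by one or more word characters (letters, digits, underscore)
    return re.findall(r"#\w+", text)

print(extract_hashtags("March with us this Friday #ClimateStrike #ActNow"))
# ['#ClimateStrike', '#ActNow']
```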
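The new `extract_metadata_from_excel` leans entirely on pandas: `pd.read_excel` loads the sheet into a DataFrame, and `to_dict(orient="records")` flattens it into one dict per row, keyed by column header. With hypothetical columns `Date` and `Platform`, for example:

```python
import pandas as pd

# Hypothetical sheet contents, standing in for the uploaded .xlsx file.
df = pd.DataFrame({"Date": ["2024-01-05", "2024-01-12"],
                   "Platform": ["Instagram", "X"]})

records = df.to_dict(orient="records")
print(records)
# [{'Date': '2024-01-05', 'Platform': 'Instagram'},
#  {'Date': '2024-01-12', 'Platform': 'X'}]
```

On any read or parse failure the function logs the exception and returns `[]`, so downstream code always receives a list.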
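`merge_metadata_with_captions` pairs the two sources purely by position: Excel row `i` is matched to the caption stored under `f"Post {i+1}"`, which silently assumes the spreadsheet rows and the DOCX post headings share the same order and numbering. Posts missing from the DOCX fall back to `"No caption available"`. The keying logic in isolation, with made-up data:

```python
metadata = [{"Platform": "Instagram"}, {"Platform": "X"}, {"Platform": "TikTok"}]
captions = {"Post 1": "March with us! #Strike", "Post 2": "Sign the petition."}

for i, meta in enumerate(metadata):
    post_number = f"Post {i+1}"
    print(post_number, "->", captions.get(post_number, "No caption available"))
# Post 1 -> March with us! #Strike
# Post 2 -> Sign the petition.
# Post 3 -> No caption available
```

`meta.copy()` keeps the original Excel records untouched while the analysis fields are added to the copy.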
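The final hunk gates all processing on both files being present and ends just after rewinding the in-memory buffer: saving into `io.BytesIO` keeps the generated DOCX out of the filesystem so it can be served directly. The lines below the cut are not shown; a typical completion (label and filename are assumptions) would hand the buffer to Streamlit's download widget:

```python
st.download_button(
    label="Download Analysis (DOCX)",
    data=docx_io,
    file_name="analysis.docx",
    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
)
```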