ahm14 committed on
Commit 0d3d327 · verified · 1 Parent(s): bd7a5fe

Update app.py

Files changed (1)
  1. app.py +48 -118
app.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import pandas as pd
 import streamlit as st
 import re
@@ -6,39 +7,32 @@ import nltk
 from docx import Document
 import io
 from langdetect import detect
-from transformers import pipeline
+from collections import Counter
 from dotenv import load_dotenv
 from langchain_groq import ChatGroq
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
+from transformers import pipeline
 
 # Load environment variables
 load_dotenv()
 
+# Check if Groq API key is available
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+if not GROQ_API_KEY:
+    logging.error("Missing Groq API key. Please set the GROQ_API_KEY environment variable.")
+    st.error("API key is missing. Please provide a valid API key.")
+
 # Initialize logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
 # Initialize LLM (Groq API)
-llm = ChatGroq(temperature=0.5, groq_api_key="GROQ_API_KEY", model_name="llama3-8b-8192")
+llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")
 
 # Download required NLTK resources
 nltk.download("punkt")
 
-# Tone categories for fallback method
-tone_categories = {
-    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
-    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
-    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief", "devastation"],
-    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
-    "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
-    "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
-    "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
-    "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
-    "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
-    "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
-}
-
-# Frame categories for fallback method
+# Frame categories with keywords
 frame_categories = {
     "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
     "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
@@ -48,13 +42,6 @@ frame_categories = {
     "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
     "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
     "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
-    "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
-    "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
-    "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
-    "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
-    "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
-    "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
-    "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
 }
 
 # Detect language
@@ -65,7 +52,7 @@ def detect_language(text):
         logging.error(f"Error detecting language: {e}")
         return "unknown"
 
-# Extract tone using Groq API (or fallback method)
+# Extract tone using Groq API
 def extract_tone(text):
     try:
         response = llm.chat([{"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
@@ -73,66 +60,40 @@ def extract_tone(text):
         return response["choices"][0]["message"]["content"].split(", ")
     except Exception as e:
         logging.error(f"Groq API error: {e}")
-        return extract_tone_fallback(text)
-
-# Fallback method for tone extraction
-def extract_tone_fallback(text):
-    detected_tones = set()
-    text_lower = text.lower()
-    for category, keywords in tone_categories.items():
-        if any(word in text_lower for word in keywords):
-            detected_tones.add(category)
-    return list(detected_tones) if detected_tones else ["Neutral"]
+        return ["Neutral"]
 
 # Extract hashtags
 def extract_hashtags(text):
     return re.findall(r"#\w+", text)
 
-# Extract frames using Groq API (with categorization: Major Focus, Significant Focus, Minor Mention)
-def extract_frames(text):
-    try:
-        # Prompt Groq to categorize frames and their focus
-        response = llm.chat([{"role": "system", "content": "Classify the following text into relevant activism frames and categorize each frame as Major Focus, Significant Focus, or Minor Mention."},
-                             {"role": "user", "content": text}])
-        return parse_frames(response["choices"][0]["message"]["content"])
-    except Exception as e:
-        logging.error(f"Groq API error: {e}")
-        return extract_frames_fallback(text)
-
-# Function to parse Groq response and categorize frames
-def parse_frames(response_text):
-    frame_data = {}
-    lines = response_text.splitlines()
-    for line in lines:
-        if "Major Focus" in line or "Significant Focus" in line or "Minor Mention" in line:
-            category = line.split(":")[0].strip()
-            frame = line.split(":")[1].strip()
-            if category not in frame_data:
-                frame_data[category] = []
-            frame_data[category].append(frame)
-    return frame_data
+# Categorize frames into Major, Significant, and Minor based on frequency
+def categorize_frames(frame_list):
+    frame_counter = Counter(frame_list)
+    categorized_frames = {"Major Focus": [], "Significant Focus": [], "Minor Mention": []}
+
+    sorted_frames = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
+
+    for i, (frame, count) in enumerate(sorted_frames):
+        if i == 0:  # Highest frequency frame
+            categorized_frames["Major Focus"].append(frame)
+        elif i < 3:  # Top 3 most mentioned frames
+            categorized_frames["Significant Focus"].append(frame)
+        else:
+            categorized_frames["Minor Mention"].append(frame)
+
+    return categorized_frames
 
-# Fallback method for frame extraction (with categorization of Major, Significant, Minor)
+# Extract frames using keyword matching and categorize
 def extract_frames_fallback(text):
-    detected_frames = set()
-    frame_focus = {"Major Focus": [], "Significant Focus": [], "Minor Mention": []}
+    detected_frames = []
     text_lower = text.lower()
-
+
     for category, keywords in frame_categories.items():
-        keyword_count = sum(word in text_lower for word in keywords)
-        if keyword_count > 3:
-            frame_focus["Major Focus"].append(category)
-        elif keyword_count > 1:
-            frame_focus["Significant Focus"].append(category)
-        elif keyword_count > 0:
-            frame_focus["Minor Mention"].append(category)
-
-    # Return categorized frames
-    for focus, categories in frame_focus.items():
-        for category in categories:
-            detected_frames.add(f"{focus}: {category}")
-
-    return list(detected_frames)
+        keyword_count = sum(1 for word in keywords if word in text_lower)
+        if keyword_count > 0:
+            detected_frames.append(category)
+
+    return categorize_frames(detected_frames)
 
 # Extract captions from DOCX
 def extract_captions_from_docx(docx_file):
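
To make the new fallback concrete, here is a minimal standalone sketch of the keyword-matching path from the hunk above — a trimmed frame_categories plus categorize_frames — run on a hypothetical caption. sample_caption and the trimmed dict are illustrative, not part of the commit:

from collections import Counter

# Trimmed copy of the frame_categories dict from the diff
frame_categories = {
    "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
    "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
    "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
}

def categorize_frames(frame_list):
    # Rank by frequency: first place is Major Focus, the next two Significant Focus
    frame_counter = Counter(frame_list)
    categorized = {"Major Focus": [], "Significant Focus": [], "Minor Mention": []}
    ranked = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
    for i, (frame, _count) in enumerate(ranked):
        if i == 0:
            categorized["Major Focus"].append(frame)
        elif i < 3:
            categorized["Significant Focus"].append(frame)
        else:
            categorized["Minor Mention"].append(frame)
    return categorized

def extract_frames_fallback(text):
    # A category is detected when any of its keywords occurs in the text
    text_lower = text.lower()
    detected = [cat for cat, kws in frame_categories.items() if any(w in text_lower for w in kws)]
    return categorize_frames(detected)

sample_caption = "New climate policy ignores water pollution and our legal rights."  # hypothetical input
print(extract_frames_fallback(sample_caption))
# {'Major Focus': ['Human Rights & Justice'],
#  'Significant Focus': ['Political & State Accountability', 'Environmental Crisis & Activism'],
#  'Minor Mention': []}

One design note: because extract_frames_fallback appends each category at most once, every Counter value is 1 and the "frequency" ranking reduces to dictionary insertion order; feeding per-keyword counts into categorize_frames would make the Major/Significant split track actual keyword frequency.
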
@@ -148,7 +109,7 @@ def extract_captions_from_docx(docx_file):
             captions[current_post].append(text)
     return {post: " ".join(lines) for post, lines in captions.items() if lines}
 
-# Function to extract metadata from an Excel file
+# Extract metadata from Excel file
 def extract_metadata_from_excel(excel_file):
     try:
         df = pd.read_excel(excel_file)
@@ -156,6 +117,7 @@ def extract_metadata_from_excel(excel_file):
         if not all(col in df.columns for col in required_columns):
             st.error("Excel file is missing required columns.")
             return []
+
         extracted_data = []
         for index, row in df.iterrows():
             post_data = {
@@ -175,28 +137,25 @@ def extract_metadata_from_excel(excel_file):
         logging.error(f"Error processing Excel file: {e}")
         return []
 
-
-# Merge metadata from Excel with the generated data
+# Merge metadata with generated analysis
 def merge_metadata_with_generated_data(generated_data, excel_metadata):
-    # Loop through the Excel data and merge it with the generated data
     for post_data in excel_metadata:
         post_number = post_data["Post Number"]
         if post_number in generated_data:
-            # If the post exists in both, merge Excel and Word data
             generated_data[post_number].update(post_data)
         else:
-            # If the post exists only in Excel, create a new entry in generated data
-            generated_data[post_number] = post_data
+            generated_data[post_number] = post_data  # Preserve metadata even if no text caption
+
     return generated_data
 
-# Function to create DOCX from merged data
+# Create DOCX file from extracted data
 def create_docx_from_data(extracted_data):
     doc = Document()
     for post_number, data in extracted_data.items():
         doc.add_heading(post_number, level=1)
         for key, value in data.items():
-            doc.add_paragraph(f"{key}: {value}")
-        doc.add_paragraph("\n")  # Add a line break between posts
+            doc.add_paragraph(f"**{key}:** {value}")
+        doc.add_paragraph("\n")
     return doc
 
 # Streamlit app
@@ -204,31 +163,21 @@ st.title("AI-Powered Activism Message Analyzer")
 
 st.write("Enter text or upload a DOCX/Excel file for analysis:")
 
-# Text input
 input_text = st.text_area("Input Text", height=200)
-
-# File upload (DOCX)
 uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
-
-# File upload (Excel)
 uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
 
-
-# Initialize output dictionary
 output_data = {}
 
-# Process Text Input
 if input_text:
     output_data["Manual Input"] = {
         "Full Caption": input_text,
         "Language": detect_language(input_text),
         "Tone": extract_tone(input_text),
        "Hashtags": extract_hashtags(input_text),
-        "Frames": extract_frames(input_text),
+        "Frames": extract_frames_fallback(input_text),
    }
-    st.success("Analysis completed for text input.")
 
-# Process DOCX file
 if uploaded_docx:
     captions = extract_captions_from_docx(uploaded_docx)
     for caption, text in captions.items():
@@ -237,35 +186,16 @@ if uploaded_docx:
             "Language": detect_language(text),
             "Tone": extract_tone(text),
             "Hashtags": extract_hashtags(text),
-            "Frames": extract_frames(text),
+            "Frames": extract_frames_fallback(text),
         }
-    st.success(f"Analysis completed for {len(captions)} posts from DOCX.")
 
-# Process Excel file
-if uploaded_excel:
-    with st.spinner("Processing Excel file..."):
-        excel_metadata = extract_metadata_from_excel(uploaded_excel)
-        if excel_metadata:
-            st.success(f"Excel metadata extracted with {len(excel_metadata)} posts.")
-        else:
-            st.warning("No valid data extracted from the Excel file.")
-
-# Merge the Word and Excel data
 if uploaded_excel:
+    excel_metadata = extract_metadata_from_excel(uploaded_excel)
     output_data = merge_metadata_with_generated_data(output_data, excel_metadata)
 
-# Display results in collapsible sections for better UI
-if output_data:
-    for post_number, data in output_data.items():
-        with st.expander(post_number):
-            for key, value in data.items():
-                st.write(f"**{key}:** {value}")
-
-# Allow downloading the merged DOCX file
 if output_data:
     docx_output = create_docx_from_data(output_data)
     docx_io = io.BytesIO()
     docx_output.save(docx_io)
     docx_io.seek(0)
-    st.download_button(label="Download Merged Analysis as DOCX", data=docx_io, file_name="merged_analysis.docx")
+    st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="merged_analysis.docx")
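
A final caveat on the tone path: langchain_groq's ChatGroq is normally driven through LangChain's message interface (invoke) rather than an OpenAI-style llm.chat([...]) call, so extract_tone as written will most likely raise and return the new ["Neutral"] default on every input. A LangChain-idiomatic sketch of the same prompt, assuming the llm instance from the diff — this rewrite is not part of the commit, and extract_tone_via_invoke is a hypothetical name:

from langchain_core.messages import HumanMessage, SystemMessage

def extract_tone_via_invoke(text, llm):
    # Same system prompt as extract_tone above, sent through ChatGroq.invoke
    response = llm.invoke([
        SystemMessage(content="Analyze the tone of the following text and provide descriptive tone labels."),
        HumanMessage(content=text),
    ])
    return response.content.split(", ")
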
 