Update app.py
Browse files
app.py
CHANGED
@@ -65,7 +65,10 @@ frame_categories = {
|
|
65 |
"Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
|
66 |
}
|
67 |
|
68 |
-
#
|
|
|
|
|
|
|
69 |
def detect_language(text):
|
70 |
try:
|
71 |
return detect(text)
|
@@ -73,7 +76,6 @@ def detect_language(text):
|
|
73 |
logging.error(f"Error detecting language: {e}")
|
74 |
return "unknown"
|
75 |
|
76 |
-
# Extract tone using Groq API (or fallback method)
|
77 |
def extract_tone(text):
|
78 |
try:
|
79 |
response = llm.chat([{"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
|
@@ -83,7 +85,6 @@ def extract_tone(text):
|
|
83 |
logging.error(f"Groq API error: {e}")
|
84 |
return extract_tone_fallback(text)
|
85 |
|
86 |
-
# Fallback method for tone extraction
|
87 |
def extract_tone_fallback(text):
|
88 |
detected_tones = set()
|
89 |
text_lower = text.lower()
|
@@ -92,49 +93,26 @@ def extract_tone_fallback(text):
|
|
92 |
detected_tones.add(category)
|
93 |
return list(detected_tones) if detected_tones else ["Neutral"]
|
94 |
|
95 |
-
# Extract hashtags
|
96 |
def extract_hashtags(text):
|
97 |
return re.findall(r"#\w+", text)
|
98 |
|
99 |
# -------------------------------------------------------------------
|
100 |
-
# New functions for frame categorization
|
101 |
# -------------------------------------------------------------------
|
102 |
|
103 |
def get_frame_category_mapping(text):
|
104 |
"""
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
- Next up to two: "Significant Focus"
|
109 |
-
- Remaining detected: "Minor Mention"
|
110 |
-
Frames not detected get "Not Applicable".
|
111 |
"""
|
112 |
-
|
113 |
-
# Calculate frequency for each frame
|
114 |
-
frame_freq = {}
|
115 |
-
for frame, keywords in frame_categories.items():
|
116 |
-
freq = sum(1 for word in keywords if word in text_lower)
|
117 |
-
frame_freq[frame] = freq
|
118 |
-
|
119 |
-
# Identify detected frames (frequency > 0) and sort descending
|
120 |
-
detected = [(frame, freq) for frame, freq in frame_freq.items() if freq > 0]
|
121 |
-
detected.sort(key=lambda x: x[1], reverse=True)
|
122 |
-
|
123 |
-
category_mapping = {}
|
124 |
-
if detected:
|
125 |
-
# Highest frequency frame as Major Focus
|
126 |
-
category_mapping[detected[0][0]] = "Major Focus"
|
127 |
-
# Next up to two frames as Significant Focus
|
128 |
-
for frame, _ in detected[1:3]:
|
129 |
-
category_mapping[frame] = "Significant Focus"
|
130 |
-
# Remaining detected frames as Minor Mention
|
131 |
-
for frame, _ in detected[3:]:
|
132 |
-
category_mapping[frame] = "Minor Mention"
|
133 |
-
# For frames not detected, assign Not Applicable
|
134 |
for frame in frame_categories.keys():
|
135 |
-
|
136 |
-
|
137 |
-
|
|
|
|
|
138 |
|
139 |
def format_frame_categories_table(mapping):
|
140 |
"""
|
@@ -158,7 +136,6 @@ def format_frame_categories_table(mapping):
|
|
158 |
# Existing functions for file processing
|
159 |
# -------------------------------------------------------------------
|
160 |
|
161 |
-
# Extract captions from DOCX
|
162 |
def extract_captions_from_docx(docx_file):
|
163 |
doc = Document(docx_file)
|
164 |
captions = {}
|
@@ -172,7 +149,6 @@ def extract_captions_from_docx(docx_file):
|
|
172 |
captions[current_post].append(text)
|
173 |
return {post: " ".join(lines) for post, lines in captions.items() if lines}
|
174 |
|
175 |
-
# Extract metadata from Excel file
|
176 |
def extract_metadata_from_excel(excel_file):
|
177 |
try:
|
178 |
df = pd.read_excel(excel_file)
|
@@ -182,7 +158,6 @@ def extract_metadata_from_excel(excel_file):
|
|
182 |
logging.error(f"Error processing Excel file: {e}")
|
183 |
return []
|
184 |
|
185 |
-
# Merge metadata with generated analysis
|
186 |
def merge_metadata_with_generated_data(generated_data, excel_metadata):
|
187 |
for post_data in excel_metadata:
|
188 |
post_number = f"Post {post_data.get('Post Number', len(generated_data) + 1)}"
|
@@ -192,7 +167,6 @@ def merge_metadata_with_generated_data(generated_data, excel_metadata):
|
|
192 |
generated_data[post_number] = post_data
|
193 |
return generated_data
|
194 |
|
195 |
-
# Create DOCX file matching the uploaded format
|
196 |
def create_docx_from_data(extracted_data):
|
197 |
doc = Document()
|
198 |
for post_number, data in extracted_data.items():
|
@@ -206,7 +180,6 @@ def create_docx_from_data(extracted_data):
|
|
206 |
value = data.get(key, "N/A")
|
207 |
if key in ["Tone", "Hashtags"]:
|
208 |
value = ", ".join(value) if isinstance(value, list) else value
|
209 |
-
# For Frames, simply add the table text as is.
|
210 |
doc.add_paragraph(f"**{key}:** {value}")
|
211 |
doc.add_paragraph("\n")
|
212 |
return doc
|
@@ -225,7 +198,6 @@ uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
|
|
225 |
output_data = {}
|
226 |
|
227 |
if input_text:
|
228 |
-
# Process manual input text
|
229 |
frame_mapping = get_frame_category_mapping(input_text)
|
230 |
frames_table = format_frame_categories_table(frame_mapping)
|
231 |
output_data["Manual Input"] = {
|
@@ -233,7 +205,7 @@ if input_text:
|
|
233 |
"Language": detect_language(input_text),
|
234 |
"Tone": extract_tone(input_text),
|
235 |
"Hashtags": extract_hashtags(input_text),
|
236 |
-
"Frames": frames_table,
|
237 |
}
|
238 |
|
239 |
if uploaded_docx:
|
@@ -253,7 +225,6 @@ if uploaded_excel:
|
|
253 |
excel_metadata = extract_metadata_from_excel(uploaded_excel)
|
254 |
output_data = merge_metadata_with_generated_data(output_data, excel_metadata)
|
255 |
|
256 |
-
# Display results in collapsible sections
|
257 |
if output_data:
|
258 |
for post_number, data in output_data.items():
|
259 |
with st.expander(post_number):
|
@@ -263,7 +234,6 @@ if output_data:
|
|
263 |
else:
|
264 |
st.write(f"**{key}:** {value}")
|
265 |
|
266 |
-
# Generate DOCX output for download
|
267 |
if output_data:
|
268 |
docx_output = create_docx_from_data(output_data)
|
269 |
docx_io = io.BytesIO()
|
|
|
65 |
"Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
|
66 |
}
|
67 |
|
68 |
# Zero-shot NLI classifier used to grade how strongly a text covers each frame.
# NOTE(review): assumes `pipeline` is imported from transformers earlier in the
# file (not visible in this hunk) — confirm.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Intensity labels the classifier chooses between for every frame category;
# consumed by get_frame_category_mapping below.
candidate_labels = ["Major Focus", "Significant Focus", "Minor Mention", "Not Applicable"]
|
71 |
+
|
72 |
def detect_language(text):
|
73 |
try:
|
74 |
return detect(text)
|
|
|
76 |
logging.error(f"Error detecting language: {e}")
|
77 |
return "unknown"
|
78 |
|
|
|
79 |
def extract_tone(text):
|
80 |
try:
|
81 |
response = llm.chat([{"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
|
|
|
85 |
logging.error(f"Groq API error: {e}")
|
86 |
return extract_tone_fallback(text)
|
87 |
|
|
|
88 |
def extract_tone_fallback(text):
|
89 |
detected_tones = set()
|
90 |
text_lower = text.lower()
|
|
|
93 |
detected_tones.add(category)
|
94 |
return list(detected_tones) if detected_tones else ["Neutral"]
|
95 |
|
|
|
96 |
def extract_hashtags(text):
    """Return every #hashtag token found in *text*, in order of appearance."""
    hashtag_pattern = re.compile(r"#\w+")
    return hashtag_pattern.findall(text)
|
98 |
|
99 |
# -------------------------------------------------------------------
|
100 |
+
# New functions for qualitative frame categorization using zero-shot classification
|
101 |
# -------------------------------------------------------------------
|
102 |
|
103 |
def get_frame_category_mapping(text):
    """Grade how strongly *text* covers each frame in ``frame_categories``.

    Runs one zero-shot classification per frame and returns a dict mapping
    each frame name to the best-scoring label from ``candidate_labels``:
    "Major Focus", "Significant Focus", "Minor Mention", or "Not Applicable".
    """
    frame_to_label = {}
    for frame_name in frame_categories:
        # The template slots each candidate label into a sentence about this
        # frame, e.g. "This text is Major Focus about X.".
        # NOTE(review): the resulting sentence is grammatically awkward, which
        # may hurt NLI quality — worth confirming against better templates.
        template = f"This text is {{}} about {frame_name}."
        classification = classifier(
            text,
            candidate_labels=candidate_labels,
            hypothesis_template=template,
        )
        # The pipeline returns labels sorted by score, highest first.
        frame_to_label[frame_name] = classification["labels"][0]
    return frame_to_label
|
116 |
|
117 |
def format_frame_categories_table(mapping):
|
118 |
"""
|
|
|
136 |
# Existing functions for file processing
|
137 |
# -------------------------------------------------------------------
|
138 |
|
|
|
139 |
def extract_captions_from_docx(docx_file):
|
140 |
doc = Document(docx_file)
|
141 |
captions = {}
|
|
|
149 |
captions[current_post].append(text)
|
150 |
return {post: " ".join(lines) for post, lines in captions.items() if lines}
|
151 |
|
|
|
152 |
def extract_metadata_from_excel(excel_file):
|
153 |
try:
|
154 |
df = pd.read_excel(excel_file)
|
|
|
158 |
logging.error(f"Error processing Excel file: {e}")
|
159 |
return []
|
160 |
|
|
|
161 |
def merge_metadata_with_generated_data(generated_data, excel_metadata):
|
162 |
for post_data in excel_metadata:
|
163 |
post_number = f"Post {post_data.get('Post Number', len(generated_data) + 1)}"
|
|
|
167 |
generated_data[post_number] = post_data
|
168 |
return generated_data
|
169 |
|
|
|
170 |
def create_docx_from_data(extracted_data):
|
171 |
doc = Document()
|
172 |
for post_number, data in extracted_data.items():
|
|
|
180 |
value = data.get(key, "N/A")
|
181 |
if key in ["Tone", "Hashtags"]:
|
182 |
value = ", ".join(value) if isinstance(value, list) else value
|
|
|
183 |
doc.add_paragraph(f"**{key}:** {value}")
|
184 |
doc.add_paragraph("\n")
|
185 |
return doc
|
|
|
198 |
output_data = {}
|
199 |
|
200 |
if input_text:
|
|
|
201 |
frame_mapping = get_frame_category_mapping(input_text)
|
202 |
frames_table = format_frame_categories_table(frame_mapping)
|
203 |
output_data["Manual Input"] = {
|
|
|
205 |
"Language": detect_language(input_text),
|
206 |
"Tone": extract_tone(input_text),
|
207 |
"Hashtags": extract_hashtags(input_text),
|
208 |
+
"Frames": frames_table,
|
209 |
}
|
210 |
|
211 |
if uploaded_docx:
|
|
|
225 |
excel_metadata = extract_metadata_from_excel(uploaded_excel)
|
226 |
output_data = merge_metadata_with_generated_data(output_data, excel_metadata)
|
227 |
|
|
|
228 |
if output_data:
|
229 |
for post_number, data in output_data.items():
|
230 |
with st.expander(post_number):
|
|
|
234 |
else:
|
235 |
st.write(f"**{key}:** {value}")
|
236 |
|
|
|
237 |
if output_data:
|
238 |
docx_output = create_docx_from_data(output_data)
|
239 |
docx_io = io.BytesIO()
|