ahm14 committed
Commit 74033b7 · verified · 1 Parent(s): bba1b37

Update app.py

Files changed (1): app.py (+15 −45)
app.py CHANGED
@@ -65,7 +65,10 @@ frame_categories = {
     "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
 }
 
-# Detect language
+# Initialize zero-shot classifier for qualitative frame categorization
+classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+candidate_labels = ["Major Focus", "Significant Focus", "Minor Mention", "Not Applicable"]
+
 def detect_language(text):
     try:
         return detect(text)
@@ -73,7 +76,6 @@ def detect_language(text):
         logging.error(f"Error detecting language: {e}")
         return "unknown"
 
-# Extract tone using Groq API (or fallback method)
 def extract_tone(text):
     try:
         response = llm.chat([{"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
@@ -83,7 +85,6 @@ def extract_tone(text):
         logging.error(f"Groq API error: {e}")
         return extract_tone_fallback(text)
 
-# Fallback method for tone extraction
 def extract_tone_fallback(text):
     detected_tones = set()
     text_lower = text.lower()
@@ -92,49 +93,26 @@ def extract_tone_fallback(text):
             detected_tones.add(category)
     return list(detected_tones) if detected_tones else ["Neutral"]
 
-# Extract hashtags
 def extract_hashtags(text):
     return re.findall(r"#\w+", text)
 
 # -------------------------------------------------------------------
-# New functions for frame categorization and display
+# New functions for qualitative frame categorization using zero-shot classification
 # -------------------------------------------------------------------
 
 def get_frame_category_mapping(text):
     """
-    Returns a mapping of every frame (from frame_categories) to one of the four categories.
-    Detected frames are assigned a focus level based on keyword frequency:
-      - Top detected: "Major Focus"
-      - Next up to two: "Significant Focus"
-      - Remaining detected: "Minor Mention"
-    Frames not detected get "Not Applicable".
+    For each frame category defined in frame_categories, this function uses a zero-shot classification
+    approach to qualitatively assess how strongly the text discusses the frame. The classifier returns one of:
+    "Major Focus", "Significant Focus", "Minor Mention", or "Not Applicable".
     """
-    text_lower = text.lower()
-    # Calculate frequency for each frame
-    frame_freq = {}
-    for frame, keywords in frame_categories.items():
-        freq = sum(1 for word in keywords if word in text_lower)
-        frame_freq[frame] = freq
-
-    # Identify detected frames (frequency > 0) and sort descending
-    detected = [(frame, freq) for frame, freq in frame_freq.items() if freq > 0]
-    detected.sort(key=lambda x: x[1], reverse=True)
-
-    category_mapping = {}
-    if detected:
-        # Highest frequency frame as Major Focus
-        category_mapping[detected[0][0]] = "Major Focus"
-        # Next up to two frames as Significant Focus
-        for frame, _ in detected[1:3]:
-            category_mapping[frame] = "Significant Focus"
-        # Remaining detected frames as Minor Mention
-        for frame, _ in detected[3:]:
-            category_mapping[frame] = "Minor Mention"
-    # For frames not detected, assign Not Applicable
+    mapping = {}
     for frame in frame_categories.keys():
-        if frame not in category_mapping:
-            category_mapping[frame] = "Not Applicable"
-    return category_mapping
+        hypothesis_template = f"This text is {{}} about {frame}."
+        result = classifier(text, candidate_labels=candidate_labels, hypothesis_template=hypothesis_template)
+        best_label = result["labels"][0]  # select the highest scoring label
+        mapping[frame] = best_label
+    return mapping
 
 def format_frame_categories_table(mapping):
     """
@@ -158,7 +136,6 @@ def format_frame_categories_table(mapping):
 # Existing functions for file processing
 # -------------------------------------------------------------------
 
-# Extract captions from DOCX
 def extract_captions_from_docx(docx_file):
     doc = Document(docx_file)
     captions = {}
@@ -172,7 +149,6 @@ def extract_captions_from_docx(docx_file):
             captions[current_post].append(text)
     return {post: " ".join(lines) for post, lines in captions.items() if lines}
 
-# Extract metadata from Excel file
 def extract_metadata_from_excel(excel_file):
     try:
         df = pd.read_excel(excel_file)
@@ -182,7 +158,6 @@ def extract_metadata_from_excel(excel_file):
         logging.error(f"Error processing Excel file: {e}")
         return []
 
-# Merge metadata with generated analysis
 def merge_metadata_with_generated_data(generated_data, excel_metadata):
     for post_data in excel_metadata:
         post_number = f"Post {post_data.get('Post Number', len(generated_data) + 1)}"
@@ -192,7 +167,6 @@ def merge_metadata_with_generated_data(generated_data, excel_metadata):
         generated_data[post_number] = post_data
     return generated_data
 
-# Create DOCX file matching the uploaded format
 def create_docx_from_data(extracted_data):
     doc = Document()
     for post_number, data in extracted_data.items():
@@ -206,7 +180,6 @@ def create_docx_from_data(extracted_data):
             value = data.get(key, "N/A")
            if key in ["Tone", "Hashtags"]:
                 value = ", ".join(value) if isinstance(value, list) else value
-            # For Frames, simply add the table text as is.
             doc.add_paragraph(f"**{key}:** {value}")
        doc.add_paragraph("\n")
     return doc
@@ -225,7 +198,6 @@ uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
 output_data = {}
 
 if input_text:
-    # Process manual input text
     frame_mapping = get_frame_category_mapping(input_text)
     frames_table = format_frame_categories_table(frame_mapping)
     output_data["Manual Input"] = {
@@ -233,7 +205,7 @@ if input_text:
         "Language": detect_language(input_text),
         "Tone": extract_tone(input_text),
         "Hashtags": extract_hashtags(input_text),
-        "Frames": frames_table,  # Markdown table displaying frame categories
+        "Frames": frames_table,
     }
 
 if uploaded_docx:
@@ -253,7 +225,6 @@ if uploaded_excel:
     excel_metadata = extract_metadata_from_excel(uploaded_excel)
     output_data = merge_metadata_with_generated_data(output_data, excel_metadata)
 
-# Display results in collapsible sections
 if output_data:
     for post_number, data in output_data.items():
         with st.expander(post_number):
@@ -263,7 +234,6 @@ if output_data:
                 else:
                     st.write(f"**{key}:** {value}")
 
-# Generate DOCX output for download
 if output_data:
     docx_output = create_docx_from_data(output_data)
     docx_io = io.BytesIO()
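
For reference, a minimal standalone sketch of the zero-shot call that the updated get_frame_category_mapping() relies on. It assumes the transformers library is installed and that pipeline is imported somewhere in app.py (the import itself is not visible in this diff); the sample text and frame name below are illustrative only.

```python
# Minimal sketch of the per-frame zero-shot scoring introduced in this commit (illustrative input).
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
candidate_labels = ["Major Focus", "Significant Focus", "Minor Mention", "Not Applicable"]

text = "Activists are demanding law reform after new reports of workplace discrimination."  # example post text
frame = "Human Rights Advocacy"  # one key from frame_categories

# The hypothesis template slots each candidate label into a sentence about the frame,
# e.g. "This text is Major Focus about Human Rights Advocacy."
hypothesis_template = f"This text is {{}} about {frame}."

result = classifier(text, candidate_labels=candidate_labels, hypothesis_template=hypothesis_template)
print(result["labels"][0], round(result["scores"][0], 3))  # highest-scoring label and its score
```

Note that the new approach runs one classifier call per frame, so per-post latency grows with the number of frames; keeping the pipeline at module level, as the diff does, at least avoids reloading the model on every request.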