ahm14 committed on
Commit
e465fa1
·
verified ·
1 Parent(s): 74033b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -19
app.py CHANGED
@@ -5,6 +5,7 @@ import re
5
  import logging
6
  import nltk
7
  from docx import Document
 
8
  import io
9
  from langdetect import detect
10
  from collections import Counter
@@ -46,7 +47,7 @@ tone_categories = {
46
  "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
47
  }
48
 
49
- # Frame categories for fallback method
50
  frame_categories = {
51
  "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
52
  "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
@@ -65,8 +66,15 @@ frame_categories = {
65
  "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
66
  }
67
 
68
- # Initialize zero-shot classifier for qualitative frame categorization
69
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
 
 
 
 
 
 
 
70
  candidate_labels = ["Major Focus", "Significant Focus", "Minor Mention", "Not Applicable"]
71
 
72
  def detect_language(text):
@@ -78,8 +86,10 @@ def detect_language(text):
78
 
79
  def extract_tone(text):
80
  try:
81
- response = llm.chat([{"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
82
- {"role": "user", "content": text}])
 
 
83
  return response["choices"][0]["message"]["content"].split(", ")
84
  except Exception as e:
85
  logging.error(f"Groq API error: {e}")
@@ -97,33 +107,42 @@ def extract_hashtags(text):
97
  return re.findall(r"#\w+", text)
98
 
99
  # -------------------------------------------------------------------
100
- # New functions for qualitative frame categorization using zero-shot classification
101
  # -------------------------------------------------------------------
102
 
103
  def get_frame_category_mapping(text):
104
  """
105
- For each frame category defined in frame_categories, this function uses a zero-shot classification
106
- approach to qualitatively assess how strongly the text discusses the frame. The classifier returns one of:
107
- "Major Focus", "Significant Focus", "Minor Mention", or "Not Applicable".
108
  """
 
 
 
 
 
 
 
 
 
 
 
 
109
  mapping = {}
110
- for frame in frame_categories.keys():
111
- hypothesis_template = f"This text is {{}} about {frame}."
112
- result = classifier(text, candidate_labels=candidate_labels, hypothesis_template=hypothesis_template)
113
- best_label = result["labels"][0] # select the highest scoring label
114
- mapping[frame] = best_label
115
  return mapping
116
 
117
  def format_frame_categories_table(mapping):
118
  """
119
- Returns a markdown-formatted table that displays each frame along with four columns:
120
  Major Focus, Significant Focus, Minor Mention, and Not Applicable.
121
- A tick (✓) is shown only in the column corresponding to the assigned category.
122
  """
123
  header = "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |\n"
124
  header += "| --- | --- | --- | --- | --- |\n"
125
- rows = ""
126
  tick = "✓"
 
127
  for frame, category in mapping.items():
128
  major = tick if category == "Major Focus" else ""
129
  significant = tick if category == "Significant Focus" else ""
@@ -174,13 +193,38 @@ def create_docx_from_data(extracted_data):
174
  ordered_keys = [
175
  "Post Number", "Date of Post", "Media Type", "Number of Pictures",
176
  "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
177
- "Full Caption", "Language", "Tone", "Hashtags", "Frames"
178
  ]
179
  for key in ordered_keys:
180
  value = data.get(key, "N/A")
181
  if key in ["Tone", "Hashtags"]:
182
  value = ", ".join(value) if isinstance(value, list) else value
183
- doc.add_paragraph(f"**{key}:** {value}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  doc.add_paragraph("\n")
185
  return doc
186
 
@@ -206,6 +250,7 @@ if input_text:
206
  "Tone": extract_tone(input_text),
207
  "Hashtags": extract_hashtags(input_text),
208
  "Frames": frames_table,
 
209
  }
210
 
211
  if uploaded_docx:
@@ -219,6 +264,7 @@ if uploaded_docx:
219
  "Tone": extract_tone(text),
220
  "Hashtags": extract_hashtags(text),
221
  "Frames": frames_table,
 
222
  }
223
 
224
  if uploaded_excel:
 
5
  import logging
6
  import nltk
7
  from docx import Document
8
+ from docx.shared import Pt
9
  import io
10
  from langdetect import detect
11
  from collections import Counter
 
47
  "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
48
  }
49
 
50
+ # Frame categories for qualitative analysis
51
  frame_categories = {
52
  "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
53
  "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
 
66
  "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
67
  }
68
 
69
+ # Initialize zero-shot classifier with a distilled model on GPU; dtype is left to torch_dtype="auto" (half precision is not forced)
70
+ classifier = pipeline(
71
+ "zero-shot-classification",
72
+ model="valhalla/distilbart-mnli-12-3", # distilled and faster model
73
+ device=0, # fixed device index 0 (first GPU); NOTE(review): no CPU fallback is configured here
74
+ torch_dtype="auto", # or use torch.float16 if you want to force fp16
75
+ batch_size=16 # adjust batch_size based on your GPU memory
76
+ )
77
+ # Candidate labels for qualitative categorization (order matters only for display)
78
  candidate_labels = ["Major Focus", "Significant Focus", "Minor Mention", "Not Applicable"]
79
 
80
  def detect_language(text):
 
86
 
87
  def extract_tone(text):
88
  try:
89
+ response = llm.chat([
90
+ {"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
91
+ {"role": "user", "content": text}
92
+ ])
93
  return response["choices"][0]["message"]["content"].split(", ")
94
  except Exception as e:
95
  logging.error(f"Groq API error: {e}")
 
107
  return re.findall(r"#\w+", text)
108
 
109
  # -------------------------------------------------------------------
110
+ # New functions for qualitative frame categorization (batched version)
111
  # -------------------------------------------------------------------
112
 
113
  def get_frame_category_mapping(text):
114
  """
115
+ For each frame defined in frame_categories, this function uses a zero-shot classification
116
+ approach to assess qualitatively how strongly the text discusses the frame.
117
+ It builds one hypothesis template per frame and returns a dict mapping each frame name to the first (top-ranked) label in its result.
118
  """
119
+ frames = list(frame_categories.keys())
120
+ # Build hypothesis templates—one per frame:
121
+ hypothesis_templates = [f"This text is {{}} about {frame}." for frame in frames]
122
+ # Repeat the same input text for each frame in the batch.
123
+ texts = [text] * len(frames)
124
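+ # Batch call. NOTE(review): the zero-shot pipeline documents hypothesis_template as a single str applied to every input; passing a list (one per frame) is not a documented usage -- verify, or call the classifier once per frame.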
125
+ results = classifier(
126
+ texts,
127
+ candidate_labels=candidate_labels,
128
+ hypothesis_template=hypothesis_templates,
129
+ batch_size=len(frames) # process all at once
130
+ )
131
  mapping = {}
132
+ for frame, result in zip(frames, results):
133
+ mapping[frame] = result["labels"][0]
 
 
 
134
  return mapping
135
 
136
  def format_frame_categories_table(mapping):
137
  """
138
+ Returns a markdown-formatted table displaying each frame with columns:
139
  Major Focus, Significant Focus, Minor Mention, and Not Applicable.
140
+ A tick (✓) marks the assigned category.
141
  """
142
  header = "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |\n"
143
  header += "| --- | --- | --- | --- | --- |\n"
 
144
  tick = "✓"
145
+ rows = ""
146
  for frame, category in mapping.items():
147
  major = tick if category == "Major Focus" else ""
148
  significant = tick if category == "Significant Focus" else ""
 
193
  ordered_keys = [
194
  "Post Number", "Date of Post", "Media Type", "Number of Pictures",
195
  "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
196
+ "Full Caption", "Language", "Tone", "Hashtags"
197
  ]
198
  for key in ordered_keys:
199
  value = data.get(key, "N/A")
200
  if key in ["Tone", "Hashtags"]:
201
  value = ", ".join(value) if isinstance(value, list) else value
202
+ para = doc.add_paragraph()
203
+ run = para.add_run(f"**{key}:** {value}")
204
+ run.font.size = Pt(11)
205
+ # Add a proper table for Frames if a mapping is available.
206
+ if "FramesMapping" in data:
207
+ doc.add_paragraph("Frames:")
208
+ mapping = data["FramesMapping"]
209
+ table = doc.add_table(rows=1, cols=5)
210
+ table.style = "Light List Accent 1"
211
+ hdr_cells = table.rows[0].cells
212
+ hdr_cells[0].text = "Frame"
213
+ hdr_cells[1].text = "Major Focus"
214
+ hdr_cells[2].text = "Significant Focus"
215
+ hdr_cells[3].text = "Minor Mention"
216
+ hdr_cells[4].text = "Not Applicable"
217
+ tick = "✓"
218
+ for frame, category in mapping.items():
219
+ row_cells = table.add_row().cells
220
+ row_cells[0].text = frame
221
+ row_cells[1].text = tick if category == "Major Focus" else ""
222
+ row_cells[2].text = tick if category == "Significant Focus" else ""
223
+ row_cells[3].text = tick if category == "Minor Mention" else ""
224
+ row_cells[4].text = tick if category == "Not Applicable" else ""
225
+ else:
226
+ value = data.get("Frames", "N/A")
227
+ doc.add_paragraph(f"**Frames:** {value}")
228
  doc.add_paragraph("\n")
229
  return doc
230
 
 
250
  "Tone": extract_tone(input_text),
251
  "Hashtags": extract_hashtags(input_text),
252
  "Frames": frames_table,
253
+ "FramesMapping": frame_mapping
254
  }
255
 
256
  if uploaded_docx:
 
264
  "Tone": extract_tone(text),
265
  "Hashtags": extract_hashtags(text),
266
  "Frames": frames_table,
267
+ "FramesMapping": frame_mapping
268
  }
269
 
270
  if uploaded_excel: