ManthaBhuvana committed on
Commit
bec29d2
·
verified ·
1 Parent(s): 1d94a82

Upload 3 files

Files changed (3)
  1. mcq_generator.py +1291 -0
  2. mcq_gradio_app.py +85 -0
  3. requirements.txt +6 -0
mcq_generator.py ADDED
@@ -0,0 +1,1291 @@
1
+ # -*- coding: utf-8 -*-
2
+ """Yet another copy of MCQ, Toxic, Bias.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1_4-bS633DBVMc5-jBLCmyUaXzAi5RL6f
8
+
9
+ #MCQ Generation Using T5
10
+ """
11
+
12
+ # Improved MCQ Generator using T5 Model
13
+ import torch
14
+ from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
15
+ import nltk
16
+ import random
17
+ from nltk.tokenize import sent_tokenize, word_tokenize
18
+ from nltk.corpus import stopwords
19
+ from nltk.tag import pos_tag
20
+ from sklearn.feature_extraction.text import TfidfVectorizer
21
+ from sklearn.metrics.pairwise import cosine_similarity
22
+ import numpy as np
23
+ import re
24
+ import string
25
+
26
+ # Download required NLTK packages
27
+ nltk.download('punkt')
28
+ nltk.download('averaged_perceptron_tagger_eng')
29
+ nltk.download('wordnet')
30
+ nltk.download('stopwords')
31
+ nltk.download('punkt_tab')
32
+
33
+ # Load Safety Models
34
+ toxicity_model = pipeline("text-classification", model="unitary/toxic-bert")
35
+ bias_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
36
+
37
+ # Enhanced Safety check function with comprehensive bias detection
38
+ def is_suitable_for_students(text):
39
+ """Comprehensive content check for appropriateness in educational settings"""
40
+ text = text.strip()
41
+ if not text:
42
+ print("⚠️ Empty paragraph provided.")
43
+ return False
44
+
45
+ # Check for text length
46
+ if len(text.split()) < 20:
47
+ print("⚠️ Text too short for meaningful MCQ generation.")
48
+ return False
49
+
50
+ # Check Toxicity
51
+ toxicity = toxicity_model(text[:512])[0]
52
+ tox_label, tox_score = toxicity['label'].lower(), toxicity['score']
53
+
54
+ # COMPREHENSIVE BIAS DETECTION
55
+
56
+ # 1. Check for gender bias
57
+ gender_bias_keywords = [
58
+ "women are", "men are", "boys are", "girls are",
59
+ "females are", "males are", "better at", "worse at",
60
+ "naturally better", "suited for", "belong in",
61
+ "should be", "can't do", "always", "never"
62
+ ]
63
+
64
+ # 2. Check for racial bias
65
+ racial_bias_keywords = [
66
+ "race", "racial", "racist", "ethnicity", "ethnic",
67
+ "black people", "white people", "asian people", "latinos",
68
+ "minorities", "majority", "immigrants", "foreigners"
69
+ ]
70
+
71
+ # 3. Check for political bias
72
+ political_bias_keywords = [
73
+ "liberal", "conservative", "democrat", "republican",
74
+ "left-wing", "right-wing", "socialism", "capitalism",
75
+ "government", "politician", "corrupt", "freedom", "rights",
76
+ "policy", "policies", "taxes", "taxation"
77
+ ]
78
+
79
+ # 4. Check for religious bias
80
+ religious_bias_keywords = [
81
+ "christian", "muslim", "jewish", "hindu", "buddhist",
82
+ "atheist", "religion", "religious", "faith", "belief",
83
+ "worship", "sacred", "holy"
84
+ ]
85
+
86
+ # 5. Check for socioeconomic bias
87
+ socioeconomic_bias_keywords = [
88
+ "poor", "rich", "wealthy", "poverty", "privileged",
89
+ "underprivileged", "class", "elite", "welfare", "lazy",
90
+ "hardworking", "deserve", "entitled"
91
+ ]
92
+
93
+ # Combined bias keywords
94
+ all_bias_keywords = (gender_bias_keywords + racial_bias_keywords +
95
+ political_bias_keywords + religious_bias_keywords +
96
+ socioeconomic_bias_keywords)
97
+
98
+ # Additional problematic generalizations
99
+ problematic_phrases = [
100
+ "more aggressive", "less educated", "less intelligent", "more violent",
101
+ "inferior", "superior", "better", "smarter", "worse", "dumber",
102
+ "tend to be more", "tend to be less", "are naturally", "by nature",
103
+ "all people", "those people", "these people", "that group",
104
+ "always", "never", "inherently", "genetically"
105
+ ]
106
+
107
+ # Check if any bias keywords are present
108
+ contains_bias_keywords = any(keyword in text.lower() for keyword in all_bias_keywords)
109
+ contains_problematic_phrases = any(phrase in text.lower() for phrase in problematic_phrases)
110
+
111
+ # Advanced bias detection using BART model
112
+ # Use both general and specific bias detection sets
113
+ general_bias_labels = ["neutral", "biased", "discriminatory", "prejudiced", "stereotyping"]
114
+ gender_bias_labels = ["gender neutral", "gender biased", "sexist"]
115
+ racial_bias_labels = ["racially neutral", "racially biased", "racist"]
116
+ political_bias_labels = ["politically neutral", "politically biased", "partisan"]
117
+
118
+ # Run general bias detection first
119
+ bias_result = bias_model(text[:512], candidate_labels=general_bias_labels)
120
+ bias_label = bias_result['labels'][0].lower()
121
+ bias_score = bias_result['scores'][0]
122
+
123
+ # If general check is uncertain, run more specific checks
124
+ if bias_score < 0.7 and contains_bias_keywords:
125
+ # Determine which specific bias check to run
126
+ if any(keyword in text.lower() for keyword in gender_bias_keywords):
127
+ specific_result = bias_model(text[:512], candidate_labels=gender_bias_labels)
128
+ if specific_result['labels'][0] != gender_bias_labels[0] and specific_result['scores'][0] > 0.6:
129
+ bias_label = "gender biased"
130
+ bias_score = specific_result['scores'][0]
131
+
132
+ if any(keyword in text.lower() for keyword in racial_bias_keywords):
133
+ specific_result = bias_model(text[:512], candidate_labels=racial_bias_labels)
134
+ if specific_result['labels'][0] != racial_bias_labels[0] and specific_result['scores'][0] > 0.6:
135
+ bias_label = "racially biased"
136
+ bias_score = specific_result['scores'][0]
137
+
138
+ if any(keyword in text.lower() for keyword in political_bias_keywords):
139
+ specific_result = bias_model(text[:512], candidate_labels=political_bias_labels)
140
+ if specific_result['labels'][0] != political_bias_labels[0] and specific_result['scores'][0] > 0.6:
141
+ bias_label = "politically biased"
142
+ bias_score = specific_result['scores'][0]
143
+
144
+ # Set appropriate thresholds
145
+ bias_threshold = 0.55 # Lower to catch more subtle bias
146
+ toxicity_threshold = 0.60
147
+
148
+ # Decision logic with detailed reporting
149
+ if tox_label == "toxic" and tox_score > toxicity_threshold:
150
+ print(f"⚠️ Toxicity Detected ({tox_score:.2f}) β€” ❌ Not Suitable for Students")
151
+ return False
152
+ elif bias_label in ["biased", "discriminatory", "prejudiced", "stereotyping",
153
+ "gender biased", "racially biased", "politically biased"] and bias_score > bias_threshold:
154
+ print(f"⚠️ {bias_label.title()} Content Detected ({bias_score:.2f}) β€” ❌ Not Suitable for Students")
155
+ return False
156
+ elif contains_problematic_phrases:
157
+ print(f"⚠️ Problematic Generalizations Detected β€” ❌ Not Suitable for Students")
158
+ return False
159
+ else:
160
+ print(f"βœ… Passed Safety Check β€” 🟒 Proceeding to Generate MCQs")
161
+ return True
162
+
163
+ class ImprovedMCQGenerator:
164
+ def __init__(self):
165
+ # Initialize QG-specific model for better question generation
166
+ self.qg_model_name = "lmqg/t5-base-squad-qg" # Specialized question generation model
167
+ try:
168
+ self.qg_tokenizer = AutoTokenizer.from_pretrained(self.qg_model_name)
169
+ self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(self.qg_model_name)
170
+ self.has_qg_model = True
171
+ except Exception:
172
+ # Fall back to T5 if specialized model fails to load
173
+ self.has_qg_model = False
174
+ print("Could not load specialized QG model, falling back to T5")
175
+
176
+ # Initialize T5 model for distractors and fallback question generation
177
+ self.t5_model_name = "google/flan-t5-base" # Using base model for better quality
178
+ self.t5_tokenizer = T5Tokenizer.from_pretrained(self.t5_model_name)
179
+ self.t5_model = T5ForConditionalGeneration.from_pretrained(self.t5_model_name)
180
+
181
+ # Configuration
182
+ self.max_length = 128
183
+ self.stop_words = set(stopwords.words('english'))
184
+
185
+ def clean_text(self, text):
186
+ """Clean and normalize text"""
187
+ text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
188
+ text = text.strip()
189
+ return text
190
+
191
+ def generate_question(self, context, answer):
192
+ """Generate a question given a context and answer using specialized QG model"""
193
+ # Find the sentence containing the answer for better context
194
+ sentences = sent_tokenize(context)
195
+ relevant_sentences = []
196
+
197
+ for sentence in sentences:
198
+ if answer.lower() in sentence.lower():
199
+ relevant_sentences.append(sentence)
200
+
201
+ if not relevant_sentences:
202
+ # If answer not found in any sentence, use a random sentence
203
+ if sentences:
204
+ relevant_sentences = [random.choice(sentences)]
205
+ else:
206
+ relevant_sentences = [context]
207
+
208
+ # Use up to 3 sentences for context (the sentence with answer + neighbors)
209
+ if len(relevant_sentences) == 1 and len(sentences) > 1:
210
+ # Find the index of the relevant sentence
211
+ idx = sentences.index(relevant_sentences[0])
212
+ if idx > 0:
213
+ relevant_sentences.append(sentences[idx-1])
214
+ if idx < len(sentences) - 1:
215
+ relevant_sentences.append(sentences[idx+1])
216
+
217
+ # Join the relevant sentences
218
+ focused_context = ' '.join(relevant_sentences)
219
+
220
+ if self.has_qg_model:
221
+ # Use specialized QG model
222
+ input_text = f"answer: {answer} context: {focused_context}"
223
+ inputs = self.qg_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
224
+
225
+ outputs = self.qg_model.generate(
226
+ input_ids=inputs["input_ids"],
227
+ attention_mask=inputs["attention_mask"],
228
+ max_length=self.max_length,
229
+ num_beams=5,
230
+ top_k=120,
231
+ top_p=0.95,
232
+ temperature=1.0,
233
+ do_sample=True,
234
+ num_return_sequences=3,
235
+ no_repeat_ngram_size=2
236
+ )
237
+
238
+ # Get multiple questions and pick the best one
239
+ questions = [self.qg_tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
240
+ valid_questions = [q for q in questions if q.endswith('?') and answer.lower() not in q.lower()]
241
+
242
+ if valid_questions:
243
+ return self.clean_text(valid_questions[0])
244
+
245
+ # Fallback to T5 model if specialized model fails or isn't available
246
+ input_text = f"generate question for answer: {answer} from context: {focused_context}"
247
+ inputs = self.t5_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
248
+
249
+ outputs = self.t5_model.generate(
250
+ input_ids=inputs["input_ids"],
251
+ attention_mask=inputs["attention_mask"],
252
+ max_length=self.max_length,
253
+ num_beams=5,
254
+ top_k=120,
255
+ top_p=0.95,
256
+ temperature=1.0,
257
+ do_sample=True,
258
+ num_return_sequences=3,
259
+ no_repeat_ngram_size=2
260
+ )
261
+
262
+ questions = [self.t5_tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
263
+
264
+ # Clean and validate questions
265
+ valid_questions = []
266
+ for q in questions:
267
+ # Format the question properly
268
+ q = self.clean_text(q)
269
+ if not q.endswith('?'):
270
+ q += '?'
271
+
272
+ # Avoid questions that contain the answer directly
273
+ if answer.lower() not in q.lower():
274
+ valid_questions.append(q)
275
+
276
+ if valid_questions:
277
+ return valid_questions[0]
278
+
279
+ # If all else fails, create a simple question
280
+ return f"Which of the following best describes {answer}?"
281
+
282
+ def extract_key_entities(self, text, n=8):
283
+ """Extract key entities from text that would make good answers"""
284
+ # Tokenize and get POS tags
285
+ sentences = sent_tokenize(text)
286
+
287
+ # Get noun phrases and named entities
288
+ key_entities = []
289
+
290
+ for sentence in sentences:
291
+ words = word_tokenize(sentence)
292
+ pos_tags = pos_tag(words)
293
+
294
+ # Extract noun phrases (consecutive nouns and adjectives)
295
+ i = 0
296
+ while i < len(pos_tags):
297
+ if pos_tags[i][1].startswith('NN') or pos_tags[i][1].startswith('JJ'):
298
+ phrase = pos_tags[i][0]
299
+ j = i + 1
300
+ while j < len(pos_tags) and (pos_tags[j][1].startswith('NN') or pos_tags[j][1] == 'JJ'):
301
+ phrase += ' ' + pos_tags[j][0]
302
+ j += 1
303
+ if len(phrase.split()) >= 1 and not all(w.lower() in self.stop_words for w in phrase.split()):
304
+ key_entities.append(phrase)
305
+ i = j
306
+ else:
307
+ i += 1
308
+
309
+ # Extract important terms based on POS tags
310
+ important_terms = []
311
+ for sentence in sentences:
312
+ words = word_tokenize(sentence)
313
+ pos_tags = pos_tag(words)
314
+
315
+ # Get nouns, verbs, and adjectives
316
+ terms = [word for word, pos in pos_tags if
317
+ (pos.startswith('NN') or pos.startswith('VB') or pos.startswith('JJ'))
318
+ and word.lower() not in self.stop_words
319
+ and len(word) > 2]
320
+
321
+ important_terms.extend(terms)
322
+
323
+ # Combine and remove duplicates
324
+ all_candidates = key_entities + important_terms
325
+ unique_candidates = []
326
+
327
+ for candidate in all_candidates:
328
+ # Clean candidate
329
+ candidate = candidate.strip()
330
+ candidate = re.sub(r'[^\w\s]', '', candidate)
331
+
332
+ # Skip if empty or just stopwords
333
+ if not candidate or all(w.lower() in self.stop_words for w in candidate.split()):
334
+ continue
335
+
336
+ # Check for duplicates
337
+ if candidate.lower() not in [c.lower() for c in unique_candidates]:
338
+ unique_candidates.append(candidate)
339
+
340
+ # Use TF-IDF to rank entities by importance
341
+ if len(unique_candidates) > n:
342
+ try:
343
+ vectorizer = TfidfVectorizer()
344
+ tfidf_matrix = vectorizer.fit_transform([text] + unique_candidates)
345
+ document_vector = tfidf_matrix[0:1]
346
+ entity_vectors = tfidf_matrix[1:]
347
+
348
+ # Calculate similarity to document
349
+ similarities = cosine_similarity(document_vector, entity_vectors).flatten()
350
+
351
+ # Get top n entities
352
+ ranked_entities = [entity for _, entity in sorted(zip(similarities, unique_candidates), reverse=True)]
353
+ return ranked_entities[:n]
354
+ except Exception:
355
+ # Fallback if TF-IDF fails
356
+ return random.sample(unique_candidates, min(n, len(unique_candidates)))
357
+
358
+ return unique_candidates[:n]
359
+
360
+ def generate_distractors(self, answer, context, n=3):
361
+ """Generate plausible distractors for a given answer"""
362
+ # Extract potential distractors from context
363
+ potential_distractors = self.extract_key_entities(context, n=15)
364
+
365
+ # Remove the correct answer and similar options
366
+ filtered_distractors = []
367
+ answer_lower = answer.lower()
368
+
369
+ for distractor in potential_distractors:
370
+ distractor_lower = distractor.lower()
371
+
372
+ # Skip if it's the answer or too similar to the answer
373
+ if distractor_lower == answer_lower:
374
+ continue
375
+ if answer_lower in distractor_lower or distractor_lower in answer_lower:
376
+ continue
377
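+ # Also reject a distractor that shares more than half of the answer's words (simple word-overlap heuristic)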
+ if len(set(distractor_lower.split()) & set(answer_lower.split())) > len(answer_lower.split()) / 2:
378
+ continue
379
+
380
+ filtered_distractors.append(distractor)
381
+
382
+ # If we need more distractors, generate them with T5
383
+ if len(filtered_distractors) < n:
384
+ input_text = f"generate alternatives for: {answer} context: {context}"
385
+ inputs = self.t5_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
386
+
387
+ outputs = self.t5_model.generate(
388
+ input_ids=inputs["input_ids"],
389
+ attention_mask=inputs["attention_mask"],
390
+ max_length=64,
391
+ num_beams=5,
392
+ top_k=50,
393
+ top_p=0.95,
394
+ temperature=1.2,
395
+ do_sample=True,
396
+ num_return_sequences=5
397
+ )
398
+
399
+ model_distractors = [self.t5_tokenizer.decode(out, skip_special_tokens=True) for out in outputs]
400
+
401
+ # Clean and validate model distractors
402
+ for distractor in model_distractors:
403
+ distractor = self.clean_text(distractor)
404
+
405
+ # Skip if it's the answer or too similar
406
+ if distractor.lower() == answer.lower():
407
+ continue
408
+ if answer.lower() in distractor.lower() or distractor.lower() in answer.lower():
409
+ continue
410
+
411
+ filtered_distractors.append(distractor)
412
+
413
+ # Ensure uniqueness
414
+ unique_distractors = []
415
+ for d in filtered_distractors:
416
+ if d.lower() not in [x.lower() for x in unique_distractors]:
417
+ unique_distractors.append(d)
418
+
419
+ # If we still don't have enough, create semantic variations
420
+ while len(unique_distractors) < n:
421
+ if not unique_distractors and not potential_distractors:
422
+ # No existing distractors to work with, create something different
423
+ unique_distractors.append(f"None of the above")
424
+ unique_distractors.append(f"All of the above")
425
+ unique_distractors.append(f"Not mentioned in the text")
426
+ else:
427
+ base = answer if not unique_distractors else random.choice(unique_distractors)
428
+ words = base.split()
429
+
430
+ if len(words) > 1:
431
+ # Modify a multi-word distractor
432
+ modified = words.copy()
433
+ pos_to_change = random.randint(0, len(words)-1)
434
+
435
+ # Make sure the new distractor is different
436
+ modification = f"alternative_{modified[pos_to_change]}"
437
+ while modification in [x.lower() for x in unique_distractors]:
438
+ modification += "_variant"
439
+
440
+ modified[pos_to_change] = modification
441
+ unique_distractors.append(" ".join(modified))
442
+ else:
443
+ # Modify a single word
444
+ modification = f"alternative_{base}"
445
+ while modification in [x.lower() for x in unique_distractors]:
446
+ modification += "_variant"
447
+
448
+ unique_distractors.append(modification)
449
+
450
+ # Return the required number of distractors
451
+ return unique_distractors[:n]
452
+
453
+ def validate_mcq(self, mcq, context):
454
+ """Validate if an MCQ meets quality standards"""
455
+ # Check if question ends with question mark
456
+ if not mcq['question'].endswith('?'):
457
+ return False
458
+
459
+ # Check if the question is too short
460
+ if len(mcq['question'].split()) < 5:
461
+ return False
462
+
463
+ # Check if question contains the answer (too obvious)
464
+ if mcq['answer'].lower() in mcq['question'].lower():
465
+ return False
466
+
467
+ # Check if options are sufficiently different
468
+ if len(set([o.lower() for o in mcq['options']])) < len(mcq['options']):
469
+ return False
470
+
471
+ # Check if answer is in the context
472
+ if mcq['answer'].lower() not in context.lower():
473
+ return False
474
+
475
+ return True
476
+
477
+ def generate_mcqs(self, paragraph, num_questions=5):
478
+ """Generate multiple-choice questions from a paragraph"""
479
+ paragraph = self.clean_text(paragraph)
480
+ mcqs = []
481
+
482
+ # Extract potential answers
483
+ potential_answers = self.extract_key_entities(paragraph, n=num_questions*3)
484
+
485
+ # Shuffle potential answers
486
+ random.shuffle(potential_answers)
487
+
488
+ # Try to generate MCQs for each potential answer
489
+ attempts = 0
490
+ max_attempts = num_questions * 3 # Try more potential answers than needed
491
+
492
+ while len(mcqs) < num_questions and attempts < max_attempts and potential_answers:
493
+ answer = potential_answers.pop(0)
494
+ attempts += 1
495
+
496
+ # Generate question
497
+ question = self.generate_question(paragraph, answer)
498
+
499
+ # Generate distractors
500
+ distractors = self.generate_distractors(answer, paragraph)
501
+
502
+ # Create MCQ
503
+ mcq = {
504
+ 'question': question,
505
+ 'options': [answer] + distractors,
506
+ 'answer': answer
507
+ }
508
+
509
+ # Validate MCQ
510
+ if self.validate_mcq(mcq, paragraph):
511
+ # Shuffle options
512
+ shuffled_options = mcq['options'].copy()
513
+ random.shuffle(shuffled_options)
514
+
515
+ # Find the index of the correct answer
516
+ correct_index = shuffled_options.index(answer)
517
+
518
+ # Update MCQ with shuffled options
519
+ mcq['options'] = shuffled_options
520
+ mcq['answer_index'] = correct_index
521
+
522
+ mcqs.append(mcq)
523
+
524
+ return mcqs[:num_questions]
525
+
526
+ # Helper functions
527
+ def format_mcq(mcq, index):
528
+ """Format MCQ for display"""
529
+ question = f"Q{index+1}: {mcq['question']}"
530
+ options = [f" {chr(65+i)}. {option}" for i, option in enumerate(mcq['options'])]
531
+ answer = f"Answer: {chr(65+mcq['answer_index'])}"
532
+ return "\n".join([question] + options + [answer, ""])
533
+
534
+ def generate_mcqs_from_paragraph(paragraph, num_questions=5):
535
+ """Generate and format MCQs from a paragraph"""
536
+ generator = ImprovedMCQGenerator()
537
+ mcqs = generator.generate_mcqs(paragraph, num_questions)
538
+
539
+ formatted_mcqs = []
540
+ for i, mcq in enumerate(mcqs):
541
+ formatted_mcqs.append(format_mcq(mcq, i))
542
+
543
+ return "\n".join(formatted_mcqs)
544
+
545
+ # Example paragraphs
546
+ example_paragraphs = [
547
+ """
548
+ The cell is the basic structural and functional unit of all living organisms. Cells can be classified into two main types: prokaryotic and eukaryotic.
549
+ Prokaryotic cells, found in bacteria and archaea, lack a defined nucleus and membrane-bound organelles. In contrast, eukaryotic cells, which make up plants,
550
+ animals, fungi, and protists, contain a nucleus that houses the cell's DNA, as well as various organelles like mitochondria and the endoplasmic reticulum.
551
+ The cell membrane regulates the movement of substances in and out of the cell, while the cytoplasm supports the internal structures.
552
+ """,
553
+
554
+ """
555
+ The Industrial Revolution was a major historical transformation that began in Great Britain in the late 18th century. It marked the shift from manual labor and
556
+ hand-made goods to machine-based manufacturing and mass production. This shift significantly increased productivity and efficiency. The textile industry was the
557
+ first to implement modern industrial methods, including the use of spinning machines and mechanized looms. A key innovation during this period was the development
558
+ of steam power, notably improved by Scottish engineer James Watt. Steam engines enabled factories to operate away from rivers, which had previously been the main
559
+ power source. Additional advancements included the invention of machine tools and the emergence of large-scale factory systems. These changes revolutionized industrial
560
+ labor and contributed to the rise of new social classes, including the industrial working class and the capitalist class. The Industrial Revolution also led to rapid
561
+ urbanization, a sharp rise in population, and eventually, improvements in living standards and economic growth.
562
+ """
563
+ ]
564
+
565
+ # Main execution
566
+ if __name__ == "__main__":
567
+ print("MCQ Generator - Testing with Example Paragraphs")
568
+ print("=" * 80)
569
+
570
+ for i, paragraph in enumerate(example_paragraphs):
571
+ print(f"\nExample {i + 1}:")
572
+ print("-" * 40)
573
+
574
+ if is_suitable_for_students(paragraph):
575
+ print(generate_mcqs_from_paragraph(paragraph))
576
+ else:
577
+ print("❌ Content not suitable for MCQ generation. Please provide different content.")
578
+
579
+ print("=" * 80)
580
+
581
+ # Interactive mode
582
+ print("\n--- MCQ Generator ---")
583
+ print("Enter a paragraph to generate MCQs (or type 'exit' to quit):")
584
+ while True:
585
+ user_input = input("> ")
586
+ if user_input.lower() == 'exit':
587
+ break
588
+ if is_suitable_for_students(user_input):
589
+ print(generate_mcqs_from_paragraph(user_input))
590
+ else:
591
+ print("❌ Content not suitable for MCQ generation. Please provide different content.")
592
+
593
+ """#Performance Metrics
594
+
595
+ """
596
+
597
+
598
+ import time
599
+ import psutil
600
+ import numpy as np
601
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
602
+ from rouge import Rouge
603
+ import matplotlib.pyplot as plt
604
+ from IPython.display import display
605
+ import pandas as pd
606
+ from nltk.tokenize import sent_tokenize
607
+ import tracemalloc
608
+ import gc
609
+ import re
610
+ import random
611
+ import warnings
612
+ from sklearn.metrics.pairwise import cosine_similarity
613
+ from sklearn.feature_extraction.text import TfidfVectorizer
614
+
615
+ class MCQPerformanceMetrics:
616
+ def __init__(self, mcq_generator):
617
+ """Initialize the performance metrics class with the MCQ generator"""
618
+ self.mcq_generator = mcq_generator
619
+ self.rouge = Rouge()
620
+ # Initialize NLTK smoothing function to handle zero counts
621
+ self.smoothing = SmoothingFunction().method1
622
+ # For semantic similarity
623
+ self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
624
+
625
+ def measure_execution_time(self, paragraphs, num_questions=5, repetitions=3):
626
+ """Measure execution time for generating MCQs"""
627
+ execution_times = []
628
+ questions_per_second = []
629
+
630
+ for paragraph in paragraphs:
631
+ paragraph_times = []
632
+ for _ in range(repetitions):
633
+ start_time = time.time()
634
+ mcqs = self.mcq_generator.generate_mcqs(paragraph, num_questions)
635
+ end_time = time.time()
636
+
637
+ execution_time = end_time - start_time
638
+ paragraph_times.append(execution_time)
639
+
640
+ # Calculate questions per second
641
+ if len(mcqs) > 0:
642
+ qps = len(mcqs) / execution_time
643
+ questions_per_second.append(qps)
644
+
645
+ execution_times.append(np.mean(paragraph_times))
646
+
647
+ return {
648
+ 'avg_execution_time': np.mean(execution_times),
649
+ 'min_execution_time': np.min(execution_times),
650
+ 'max_execution_time': np.max(execution_times),
651
+ 'avg_questions_per_second': np.mean(questions_per_second) if questions_per_second else 0
652
+ }
653
+
654
+ def measure_memory_usage(self, paragraph, num_questions=5):
655
+ """Measure peak memory usage during MCQ generation"""
656
+ # Clear memory before test
657
+ gc.collect()
658
+
659
+ # Start memory tracking
660
+ tracemalloc.start()
661
+
662
+ # Generate MCQs
663
+ self.mcq_generator.generate_mcqs(paragraph, num_questions)
664
+
665
+ # Get peak memory usage
666
+ current, peak = tracemalloc.get_traced_memory()
667
+
668
+ # Stop tracking
669
+ tracemalloc.stop()
670
+
671
+ return {
672
+ 'current_memory_MB': current / (1024 * 1024),
673
+ 'peak_memory_MB': peak / (1024 * 1024)
674
+ }
675
+
676
+ def compute_semantic_similarity(self, text1, text2):
677
+ """Compute semantic similarity between two texts using TF-IDF and cosine similarity"""
678
+ try:
679
+ # Handle empty strings
680
+ if not text1.strip() or not text2.strip():
681
+ return 0
682
+
683
+ # Fit and transform the texts
684
+ tfidf_matrix = self.tfidf_vectorizer.fit_transform([text1, text2])
685
+
686
+ # Compute cosine similarity
687
+ similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
688
+ return similarity
689
+ except Exception as e:
690
+ print(f"Error computing semantic similarity: {e}")
691
+ return 0
692
+
693
+ def evaluate_question_quality(self, mcqs, reference_questions=None):
694
+ """Evaluate the quality of generated questions with improved reference handling"""
695
+ if not mcqs:
696
+ return {'avg_question_length': 0, 'has_question_mark': 0}
697
+
698
+ # Basic metrics
699
+ question_lengths = [len(mcq['question'].split()) for mcq in mcqs]
700
+ has_question_mark = [int(mcq['question'].endswith('?')) for mcq in mcqs]
701
+
702
+ # Option distinctiveness - average cosine distance between options
703
+ option_distinctiveness = []
704
+ for mcq in mcqs:
705
+ options = mcq['options']
706
+ if len(options) < 2:
707
+ continue
708
+
709
+ # Enhanced distinctiveness calculation using TF-IDF and cosine similarity
710
+ distinctiveness_scores = []
711
+ for i in range(len(options)):
712
+ for j in range(i+1, len(options)):
713
+ if not options[i].strip() or not options[j].strip():
714
+ continue
715
+
716
+ # Calculate semantic similarity between options
717
+ similarity = self.compute_semantic_similarity(options[i], options[j])
718
+ distinctiveness_scores.append(1 - similarity) # Higher is better (more distinct)
719
+
720
+ if distinctiveness_scores:
721
+ option_distinctiveness.append(np.mean(distinctiveness_scores))
722
+
723
+ # Compare with reference questions if provided
724
+ bleu_scores = []
725
+ modified_bleu_scores = [] # Using smoothing function
726
+ rouge_scores = {'rouge-1': [], 'rouge-2': [], 'rouge-l': []}
727
+ semantic_similarities = [] # New metric for semantic similarity
728
+
729
+ if reference_questions and len(reference_questions) > 0:
730
+ # Print debug info
731
+ print(f"Number of MCQs: {len(mcqs)}")
732
+ print(f"Number of reference questions: {len(reference_questions)}")
733
+
734
+ # Align MCQs with reference questions based on semantic similarity
735
+ aligned_pairs = []
736
+
737
+ if len(mcqs) <= len(reference_questions):
738
+ # If we have enough reference questions, find the best match for each MCQ
739
+ for mcq in mcqs:
740
+ best_match_idx = -1
741
+ best_similarity = -1
742
+
743
+ for i, ref in enumerate(reference_questions):
744
+ if i in [pair[1] for pair in aligned_pairs]:
745
+ continue # Skip already matched references
746
+
747
+ similarity = self.compute_semantic_similarity(
748
+ mcq['question'],
749
+ ref if isinstance(ref, str) else ""
750
+ )
751
+
752
+ if similarity > best_similarity:
753
+ best_similarity = similarity
754
+ best_match_idx = i
755
+
756
+ if best_match_idx >= 0:
757
+ aligned_pairs.append((mcq, best_match_idx))
758
+ else:
759
+ # If no match found, use the first available reference
760
+ for i, ref in enumerate(reference_questions):
761
+ if i not in [pair[1] for pair in aligned_pairs]:
762
+ aligned_pairs.append((mcq, i))
763
+ break
764
+ else:
765
+ # If we have more MCQs than references, match each reference to its best MCQ
766
+ used_mcqs = set()
767
+ for i, ref in enumerate(reference_questions):
768
+ best_match_idx = -1
769
+ best_similarity = -1
770
+
771
+ for j, mcq in enumerate(mcqs):
772
+ if j in used_mcqs:
773
+ continue # Skip already matched MCQs
774
+
775
+ similarity = self.compute_semantic_similarity(
776
+ mcq['question'],
777
+ ref if isinstance(ref, str) else ""
778
+ )
779
+
780
+ if similarity > best_similarity:
781
+ best_similarity = similarity
782
+ best_match_idx = j
783
+
784
+ if best_match_idx >= 0:
785
+ aligned_pairs.append((mcqs[best_match_idx], i))
786
+ used_mcqs.add(best_match_idx)
787
+
788
+ # Add remaining MCQs with cycling through references
789
+ for i, mcq in enumerate(mcqs):
790
+ if i not in used_mcqs:
791
+ ref_idx = i % len(reference_questions)
792
+ aligned_pairs.append((mcq, ref_idx))
793
+
794
+ # Calculate metrics for aligned pairs
795
+ for mcq, ref_idx in aligned_pairs:
796
+ reference = reference_questions[ref_idx] if isinstance(reference_questions[ref_idx], str) else ""
797
+
798
+ if not reference:
799
+ continue
800
+
801
+ ref_tokens = reference.split()
802
+ hyp_tokens = mcq['question'].split()
803
+
804
+ # Debug output
805
+ print(f"\nReference ({ref_idx}): {reference}")
806
+ print(f"Generated: {mcq['question']}")
807
+
808
+ # Calculate semantic similarity
809
+ sem_sim = self.compute_semantic_similarity(mcq['question'], reference)
810
+ semantic_similarities.append(sem_sim)
811
+ print(f"Semantic similarity: {sem_sim:.4f}")
812
+
813
+ try:
814
+ with warnings.catch_warnings():
815
+ warnings.simplefilter("ignore")
816
+
817
+ # Standard BLEU
818
+ bleu_score = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25))
819
+ bleu_scores.append(bleu_score)
820
+
821
+ # BLEU with smoothing to handle zero counts
822
+ modified_bleu = sentence_bleu(
823
+ [ref_tokens],
824
+ hyp_tokens,
825
+ weights=(0.25, 0.25, 0.25, 0.25),
826
+ smoothing_function=self.smoothing
827
+ )
828
+ modified_bleu_scores.append(modified_bleu)
829
+
830
+ print(f"Smoothed BLEU: {modified_bleu:.4f}")
831
+ except Exception as e:
832
+ print(f"BLEU score calculation error: {e}")
833
+
834
+ # ROUGE scores
835
+ try:
836
+ if len(reference) > 0 and len(mcq['question']) > 0:
837
+ rouge_result = self.rouge.get_scores(mcq['question'], reference)[0]
838
+ rouge_scores['rouge-1'].append(rouge_result['rouge-1']['f'])
839
+ rouge_scores['rouge-2'].append(rouge_result['rouge-2']['f'])
840
+ rouge_scores['rouge-l'].append(rouge_result['rouge-l']['f'])
841
+
842
+ print(f"ROUGE-1: {rouge_result['rouge-1']['f']:.4f}, ROUGE-L: {rouge_result['rouge-l']['f']:.4f}")
843
+ except Exception as e:
844
+ print(f"ROUGE score calculation error: {e}")
845
+
846
+ results = {
847
+ 'avg_question_length': np.mean(question_lengths),
848
+ 'has_question_mark': np.mean(has_question_mark) * 100, # as percentage
849
+ 'option_distinctiveness': np.mean(option_distinctiveness) if option_distinctiveness else 0
850
+ }
851
+
852
+ if modified_bleu_scores:
853
+ results['avg_smoothed_bleu_score'] = np.mean(modified_bleu_scores)
854
+
855
+ if semantic_similarities:
856
+ results['avg_semantic_similarity'] = np.mean(semantic_similarities)
857
+
858
+ for rouge_type, scores in rouge_scores.items():
859
+ if scores:
860
+ results[f'avg_{rouge_type}'] = np.mean(scores)
861
+
862
+ return results
863
+
864
+ def analyze_distractor_quality(self, mcqs, context):
865
+ """Analyze the quality of distractors with improved semantic analysis"""
866
+ if not mcqs:
867
+ return {}
868
+
869
+ # Check if distractor is in context
870
+ context_presence = []
871
+ semantic_relevance = [] # New metric for semantic relevance to context
872
+
873
+ for mcq in mcqs:
874
+ try:
875
+ correct_answer = mcq['options'][mcq['answer_index']]
876
+ distractors = [opt for i, opt in enumerate(mcq['options']) if i != mcq['answer_index']]
877
+
878
+ distractor_in_context = []
879
+ distractor_semantic_relevance = []
880
+
881
+ for distractor in distractors:
882
+ # Check semantic relevance to context
883
+ semantic_sim = self.compute_semantic_similarity(distractor, context)
884
+ distractor_semantic_relevance.append(semantic_sim)
885
+
886
+ # Traditional word overlap check
887
+ distractor_words = set(distractor.lower().split())
888
+ context_words = set(context.lower().split())
889
+
890
+ if distractor_words:
891
+ overlap_ratio = len(distractor_words.intersection(context_words)) / len(distractor_words)
892
+ distractor_in_context.append(overlap_ratio >= 0.5) # At least 50% of words in context
893
+
894
+ if distractor_in_context:
895
+ context_presence.append(sum(distractor_in_context) / len(distractor_in_context))
896
+
897
+ if distractor_semantic_relevance:
898
+ semantic_relevance.append(np.mean(distractor_semantic_relevance))
899
+ except Exception as e:
900
+ print(f"Error in distractor context analysis: {e}")
901
+
902
+ # Calculate semantic similarity between distractors and correct answer
903
+ distractor_answer_similarity = []
904
+ distractor_plausibility = [] # New metric for plausibility
905
+
906
+ for mcq in mcqs:
907
+ try:
908
+ correct_answer = mcq['options'][mcq['answer_index']]
909
+ distractors = [opt for i, opt in enumerate(mcq['options']) if i != mcq['answer_index']]
910
+
911
+ similarities = []
912
+ plausibility_scores = []
913
+
914
+ for distractor in distractors:
915
+ # Semantic similarity
916
+ similarity = self.compute_semantic_similarity(correct_answer, distractor)
917
+ similarities.append(similarity)
918
+
919
+ # Plausibility - should be somewhat similar to correct answer but not too similar
920
+ # Sweet spot is around 0.3-0.7 similarity
921
+ plausibility = 1.0 - abs(0.5 - similarity) # 1.0 at 0.5 similarity, decreasing on both sides
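+ # e.g. similarity 0.5 -> plausibility 1.0, similarity 0.2 -> 0.7, similarity 0.9 -> 0.6 (illustrative values)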
922
+ plausibility_scores.append(plausibility)
923
+
924
+ if similarities:
925
+ distractor_answer_similarity.append(np.mean(similarities))
926
+
927
+ if plausibility_scores:
928
+ distractor_plausibility.append(np.mean(plausibility_scores))
929
+ except Exception as e:
930
+ print(f"Error in distractor similarity analysis: {e}")
931
+
932
+ results = {
933
+ 'context_presence': np.mean(context_presence) * 100 if context_presence else 0, # as percentage
934
+ 'distractor_answer_similarity': np.mean(distractor_answer_similarity) * 100 if distractor_answer_similarity else 0 # as percentage
935
+ }
936
+
937
+ # Add new metrics
938
+ if semantic_relevance:
939
+ results['distractor_semantic_relevance'] = np.mean(semantic_relevance)
940
+
941
+ if distractor_plausibility:
942
+ results['distractor_plausibility'] = np.mean(distractor_plausibility)
943
+
944
+ return results
945
+
946
+ def calculate_readability_scores(self, mcqs):
947
+ """Calculate readability scores for questions"""
948
+ try:
949
+ import textstat
950
+ has_textstat = True
951
+ except ImportError:
952
+ has_textstat = False
953
+ print("textstat package not found - readability metrics will be skipped")
954
+ return {}
955
+
956
+ if not has_textstat or not mcqs:
957
+ return {}
958
+
959
+ readability_scores = {
960
+ 'flesch_reading_ease': [],
961
+ 'flesch_kincaid_grade': [],
962
+ 'automated_readability_index': [],
963
+ 'smog_index': [], # Added SMOG Index
964
+ 'coleman_liau_index': [] # Added Coleman-Liau Index
965
+ }
966
+
967
+ for mcq in mcqs:
968
+ question_text = mcq['question']
969
+
970
+ # Add options to create full MCQ text for readability analysis
971
+ full_mcq_text = question_text + "\n"
972
+ for i, option in enumerate(mcq['options']):
973
+ full_mcq_text += f"{chr(65+i)}. {option}\n"
974
+
975
+ try:
976
+ readability_scores['flesch_reading_ease'].append(textstat.flesch_reading_ease(full_mcq_text))
977
+ readability_scores['flesch_kincaid_grade'].append(textstat.flesch_kincaid_grade(full_mcq_text))
978
+ readability_scores['automated_readability_index'].append(textstat.automated_readability_index(full_mcq_text))
979
+ readability_scores['smog_index'].append(textstat.smog_index(full_mcq_text))
980
+ readability_scores['coleman_liau_index'].append(textstat.coleman_liau_index(full_mcq_text))
981
+ except Exception as e:
982
+ print(f"Error calculating readability: {e}")
983
+
984
+ result = {}
985
+ for metric, scores in readability_scores.items():
986
+ if scores:
987
+ result[f'avg_{metric}'] = np.mean(scores)
988
+
989
+ return result
990
+
991
+ def evaluate_question_diversity(self, mcqs):
992
+ """Evaluate the diversity of questions generated"""
993
+ if not mcqs or len(mcqs) < 2:
994
+ return {'question_diversity': 0}
995
+
996
+ # Calculate pairwise similarity between questions
997
+ similarities = []
998
+ for i in range(len(mcqs)):
999
+ for j in range(i+1, len(mcqs)):
1000
+ similarity = self.compute_semantic_similarity(mcqs[i]['question'], mcqs[j]['question'])
1001
+ similarities.append(similarity)
1002
+
1003
+ # Diversity is inverse of average similarity
1004
+ avg_similarity = np.mean(similarities) if similarities else 0
1005
+ diversity = 1 - avg_similarity
1006
+
1007
+ return {'question_diversity': diversity}
1008
+
1009
+ def evaluate_contextual_relevance(self, mcqs, context):
1010
+ """Evaluate how relevant questions are to the context"""
1011
+ if not mcqs:
1012
+ return {'contextual_relevance': 0}
1013
+
1014
+ relevance_scores = []
1015
+ for mcq in mcqs:
1016
+ # Calculate similarity between question and context
1017
+ similarity = self.compute_semantic_similarity(mcq['question'], context)
1018
+ relevance_scores.append(similarity)
1019
+
1020
+ return {'contextual_relevance': np.mean(relevance_scores) if relevance_scores else 0}
1021
+
1022
+ def evaluate(self, paragraphs, num_questions=5, reference_questions=None):
1023
+ """Run a comprehensive evaluation of the MCQ generator"""
1024
+ try:
1025
+ # Get one set of MCQs for quality evaluation
1026
+ sample_paragraph = paragraphs[0] if isinstance(paragraphs, list) else paragraphs
1027
+ sample_mcqs = self.mcq_generator.generate_mcqs(sample_paragraph, num_questions)
1028
+
1029
+ print(f"Generated {len(sample_mcqs)} MCQs for evaluation")
1030
+
1031
+ # Execution time
1032
+ timing_metrics = self.measure_execution_time(
1033
+ paragraphs if isinstance(paragraphs, list) else [paragraphs],
1034
+ num_questions
1035
+ )
1036
+
1037
+ # Memory usage
1038
+ memory_metrics = self.measure_memory_usage(sample_paragraph, num_questions)
1039
+
1040
+ # Question quality
1041
+ quality_metrics = self.evaluate_question_quality(sample_mcqs, reference_questions)
1042
+
1043
+ # Distractor quality
1044
+ distractor_metrics = self.analyze_distractor_quality(sample_mcqs, sample_paragraph)
1045
+
1046
+ # Readability metrics
1047
+ readability_metrics = self.calculate_readability_scores(sample_mcqs)
1048
+
1049
+ # New metrics
1050
+ diversity_metrics = self.evaluate_question_diversity(sample_mcqs)
1051
+ relevance_metrics = self.evaluate_contextual_relevance(sample_mcqs, sample_paragraph)
1052
+
1053
+ # Combine all metrics
1054
+ all_metrics = {
1055
+ **timing_metrics,
1056
+ **memory_metrics,
1057
+ **quality_metrics,
1058
+ **distractor_metrics,
1059
+ **readability_metrics,
1060
+ **diversity_metrics,
1061
+ **relevance_metrics
1062
+ }
1063
+
1064
+ return all_metrics
1065
+ except Exception as e:
1066
+ print(f"Error during evaluation: {e}")
1067
+ import traceback
1068
+ traceback.print_exc()
1069
+ return {"error": str(e)}
1070
+
1071
+ def visualize_results(self, metrics):
1072
+ """Visualize the evaluation results with enhanced charts"""
1073
+ try:
1074
+ # Create a dataframe for better display
1075
+ metrics_df = pd.DataFrame({k: [v] for k, v in metrics.items()})
1076
+
1077
+ # Format the numbers
1078
+ for col in metrics_df.columns:
1079
+ if 'time' in col:
1080
+ metrics_df[col] = metrics_df[col].round(2).astype(str) + ' sec'
1081
+ elif 'memory' in col:
1082
+ metrics_df[col] = metrics_df[col].round(2).astype(str) + ' MB'
1083
+ elif col in ['has_question_mark', 'context_presence', 'distractor_answer_similarity']:
1084
+ metrics_df[col] = metrics_df[col].round(1).astype(str) + '%'
1085
+ else:
1086
+ metrics_df[col] = metrics_df[col].round(3)
1087
+
1088
+ display(metrics_df.T.rename(columns={0: 'Value'}))
1089
+
1090
+ # Create enhanced visualizations
1091
+ fig = plt.figure(figsize=(16, 14))
1092
+
1093
+ # Create 3 rows, 2 columns for more organized charts
1094
+ gs = fig.add_gridspec(3, 2)
1095
+
1096
+ # Filter out metrics that shouldn't be plotted
1097
+ plottable_metrics = {k: v for k, v in metrics.items() if isinstance(v, (int, float))}
1098
+
1099
+ # 1. Performance Metrics
1100
+ ax1 = fig.add_subplot(gs[0, 0])
1101
+ performance_keys = ['avg_execution_time', 'avg_questions_per_second']
1102
+ performance_metrics = [plottable_metrics.get(k, 0) for k in performance_keys]
1103
+ bars = ax1.bar(performance_keys, performance_metrics, color=['#3498db', '#2ecc71'])
1104
+ ax1.set_title('Performance Metrics', fontsize=14, fontweight='bold')
1105
+ ax1.set_xticklabels(performance_keys, rotation=45, ha='right')
1106
+ # Add value labels on bars
1107
+ for bar in bars:
1108
+ height = bar.get_height()
1109
+ ax1.text(bar.get_x() + bar.get_width()/2., height + 0.1,
1110
+ f'{height:.2f}', ha='center', va='bottom')
1111
+
1112
+ # 2. Memory Usage
1113
+ ax2 = fig.add_subplot(gs[0, 1])
1114
+ memory_keys = ['current_memory_MB', 'peak_memory_MB']
1115
+ memory_metrics = [plottable_metrics.get(k, 0) for k in memory_keys]
1116
+ bars = ax2.bar(memory_keys, memory_metrics, color=['#9b59b6', '#34495e'])
1117
+ ax2.set_title('Memory Usage (MB)', fontsize=14, fontweight='bold')
1118
+ # Add value labels
1119
+ for bar in bars:
1120
+ height = bar.get_height()
1121
+ ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
1122
+ f'{height:.2f}', ha='center', va='bottom')
1123
+
1124
+ # 3. Question Quality
1125
+ ax3 = fig.add_subplot(gs[1, 0])
1126
+ quality_keys = ['avg_question_length', 'has_question_mark', 'option_distinctiveness',
1127
+ 'question_diversity', 'contextual_relevance']
1128
+ quality_metrics = [
1129
+ plottable_metrics.get('avg_question_length', 0),
1130
+ plottable_metrics.get('has_question_mark', 0) / 100, # Convert from percentage
1131
+ plottable_metrics.get('option_distinctiveness', 0),
1132
+ plottable_metrics.get('question_diversity', 0),
1133
+ plottable_metrics.get('contextual_relevance', 0)
1134
+ ]
1135
+ bars = ax3.bar(['Avg Length', 'Question Mark', 'Option Distinct.', 'Diversity', 'Relevance'],
1136
+ quality_metrics, color=['#f39c12', '#d35400', '#c0392b', '#16a085', '#27ae60'])
1137
+ ax3.set_title('Question Quality Metrics', fontsize=14, fontweight='bold')
1138
+ ax3.set_xticklabels(['Avg Length', 'Question Mark', 'Option Distinct.', 'Diversity', 'Relevance'],
1139
+ rotation=45, ha='right')
1140
+ # Add value labels
1141
+ for bar in bars:
1142
+ height = bar.get_height()
1143
+ ax3.text(bar.get_x() + bar.get_width()/2., height + 0.01,
1144
+ f'{height:.2f}', ha='center', va='bottom')
1145
+
1146
+ # 4. Distractor Quality
1147
+ ax4 = fig.add_subplot(gs[1, 1])
1148
+ distractor_keys = ['context_presence', 'distractor_answer_similarity',
1149
+ 'distractor_semantic_relevance', 'distractor_plausibility']
1150
+ distractor_metrics = [
1151
+ plottable_metrics.get('context_presence', 0) / 100, # Convert from percentage
1152
+ plottable_metrics.get('distractor_answer_similarity', 0) / 100, # Convert from percentage
1153
+ plottable_metrics.get('distractor_semantic_relevance', 0),
1154
+ plottable_metrics.get('distractor_plausibility', 0)
1155
+ ]
1156
+ bars = ax4.bar(['Context', 'Answer Sim.', 'Semantic Rel.', 'Plausibility'],
1157
+ distractor_metrics, color=['#1abc9c', '#e74c3c', '#3498db', '#f1c40f'])
1158
+ ax4.set_title('Distractor Quality Metrics', fontsize=14, fontweight='bold')
1159
+ ax4.set_xticklabels(['Context', 'Answer Sim.', 'Semantic Rel.', 'Plausibility'],
1160
+ rotation=45, ha='right')
1161
+ # Add value labels
1162
+ for bar in bars:
1163
+ height = bar.get_height()
1164
+ ax4.text(bar.get_x() + bar.get_width()/2., height + 0.01,
1165
+ f'{height:.2f}', ha='center', va='bottom')
1166
+
1167
+ # 5. NLP Metrics
1168
+ ax5 = fig.add_subplot(gs[2, 0])
1169
+ nlp_keys = ['avg_smoothed_bleu_score', 'avg_semantic_similarity',
1170
+ 'avg_rouge-1', 'avg_rouge-2', 'avg_rouge-l']
1171
+ nlp_metrics = [
1172
+ plottable_metrics.get('avg_smoothed_bleu_score', 0),
1173
+ plottable_metrics.get('avg_semantic_similarity', 0),
1174
+ plottable_metrics.get('avg_rouge-1', 0),
1175
+ plottable_metrics.get('avg_rouge-2', 0),
1176
+ plottable_metrics.get('avg_rouge-l', 0)
1177
+ ]
1178
+ bars = ax5.bar(['Smooth BLEU', 'Semantic', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L'],
1179
+ nlp_metrics, color=['#3498db', '#2980b9', '#9b59b6', '#e74c3c', '#c0392b', '#d35400'])
1180
+ ax5.set_title('NLP Evaluation Metrics', fontsize=14, fontweight='bold')
1181
+ ax5.set_xticklabels(['Smooth BLEU', 'Semantic', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L'],
1182
+ rotation=45, ha='right')
1183
+ # Add value labels
1184
+ for bar in bars:
1185
+ height = bar.get_height()
1186
+ ax5.text(bar.get_x() + bar.get_width()/2., height + 0.01,
1187
+ f'{height:.3f}', ha='center', va='bottom')
1188
+
1189
+ # 6. Readability Metrics
1190
+ ax6 = fig.add_subplot(gs[2, 1])
1191
+ readability_keys = ['avg_flesch_reading_ease', 'avg_flesch_kincaid_grade',
1192
+ 'avg_automated_readability_index', 'avg_smog_index', 'avg_coleman_liau_index']
1193
+ readability_metrics = [
1194
+ plottable_metrics.get('avg_flesch_reading_ease', 0),
1195
+ plottable_metrics.get('avg_flesch_kincaid_grade', 0),
1196
+ plottable_metrics.get('avg_automated_readability_index', 0),
1197
+ plottable_metrics.get('avg_smog_index', 0),
1198
+ plottable_metrics.get('avg_coleman_liau_index', 0)
1199
+ ]
1200
+ bars = ax6.bar(['Flesch Ease', 'Kincaid', 'ARI', 'SMOG', 'Coleman-Liau'],
1201
+ readability_metrics, color=['#27ae60', '#2ecc71', '#16a085', '#1abc9c', '#2980b9'])
1202
+ ax6.set_title('Readability Metrics', fontsize=14, fontweight='bold')
1203
+ ax6.set_xticklabels(['Flesch Ease', 'Kincaid', 'ARI', 'SMOG', 'Coleman-Liau'],
1204
+ rotation=45, ha='right')
1205
+ # Add value labels
1206
+ for bar in bars:
1207
+ height = bar.get_height()
1208
+ ax6.text(bar.get_x() + bar.get_width()/2., height + 0.1,
1209
+ f'{height:.2f}', ha='center', va='bottom')
1210
+
1211
+ plt.tight_layout()
1212
+ plt.show()
1213
+
1214
+ return fig
1215
+ except Exception as e:
1216
+ print(f"Error in visualization: {e}")
1217
+ import traceback
1218
+ traceback.print_exc()
1219
+
1220
+ # Example usage function with improved error handling
1221
+ def run_performance_evaluation():
1222
+ # Import the MCQ generator
1223
+ try:
1224
+ # First try to import from the module
1225
+ from improved_mcq_generator import ImprovedMCQGenerator
1226
+ except ImportError:
1227
+ # If that fails, try to load the class from current namespace
1228
+ try:
1229
+ # This assumes the class is defined in the current session
1230
+ ImprovedMCQGenerator = globals().get('ImprovedMCQGenerator')
1231
+ if ImprovedMCQGenerator is None:
1232
+ raise ImportError("ImprovedMCQGenerator class not found")
1233
+ except Exception as e:
1234
+ print(f"Error importing ImprovedMCQGenerator: {e}")
1235
+ return
1236
+
1237
+ # Test paragraphs - use a variety for better assessment
1238
+ test_paragraphs = [
1239
+ """The cell is the basic structural and functional unit of all living organisms. Cells can be classified into two main types: prokaryotic and eukaryotic.
1240
+ Prokaryotic cells, found in bacteria and archaea, lack a defined nucleus and membrane-bound organelles. In contrast, eukaryotic cells, which make up plants,
1241
+ animals, fungi, and protists, contain a nucleus that houses the cell's DNA, as well as various organelles like mitochondria and the endoplasmic reticulum.
1242
+ The cell membrane regulates the movement of substances in and out of the cell, while the cytoplasm supports the internal structures."""
1243
+ ]
1244
+
1245
+ # Reference questions for comparison (optional)
1246
+ reference_questions = [
1247
+ "What do prokaryotic cells lack?",
1248
+ "Which cell structures are missing in prokaryotic cells compared to eukaryotic cells?",
1249
+ "What type of cells are found in bacteria and archaea?",
1250
+ "What is the basic structural and functional unit of all living organisms?",
1251
+ "What controls the movement of substances in and out of a cell?"
1252
+ ]
1253
+
1254
+
1255
+ try:
1256
+ # Initialize the MCQ generator
1257
+ mcq_generator = ImprovedMCQGenerator()
1258
+
1259
+ # Initialize performance metrics
1260
+ metrics_evaluator = MCQPerformanceMetrics(mcq_generator)
1261
+
1262
+ # Run evaluation
1263
+ print("Running performance evaluation...")
1264
+ results = metrics_evaluator.evaluate(test_paragraphs, num_questions=5, reference_questions=reference_questions)
1265
+
1266
+ # Visualize results
1267
+ metrics_evaluator.visualize_results(results)
1268
+
1269
+ # Print detailed results
1270
+ print("\nDetailed Performance Metrics:")
1271
+ for metric, value in results.items():
1272
+ # Format the value based on metric type
1273
+ if isinstance(value, (int, float)):
1274
+ if 'time' in metric:
1275
+ print(f"{metric}: {value:.2f} seconds")
1276
+ elif 'memory' in metric:
1277
+ print(f"{metric}: {value:.2f} MB")
1278
+ elif metric in ['has_question_mark', 'context_presence', 'distractor_answer_similarity']:
1279
+ print(f"{metric}: {value:.1f}%")
1280
+ else:
1281
+ print(f"{metric}: {value:.3f}")
1282
+ else:
1283
+ print(f"{metric}: {value}")
1284
+
1285
+ except Exception as e:
1286
+ print(f"Error in performance evaluation: {e}")
1287
+ import traceback
1288
+ traceback.print_exc()
1289
+
1290
+ if __name__ == "__main__":
1291
+ run_performance_evaluation()
mcq_gradio_app.py ADDED
@@ -0,0 +1,85 @@
1
+
2
+ import gradio as gr
3
+ import pandas as pd
4
+ from mcq_generator import ImprovedMCQGenerator, is_suitable_for_students
5
+ import io
6
+
7
+ # Load MCQ generator once
8
+ mcq_generator = ImprovedMCQGenerator()
9
+
10
+ def generate_mcqs_ui(paragraph, num_questions):
11
+ if not paragraph.strip():
12
+ return None, None, "⚠️ Please enter a valid paragraph."
13
+
14
+ if not is_suitable_for_students(paragraph):
15
+ return None, None, "❌ The paragraph is not suitable for MCQ generation (due to bias/toxicity/short length)."
16
+
17
+ try:
18
+ mcqs = mcq_generator.generate_mcqs(paragraph, num_questions)
19
+
20
+ # Create pretty formatted MCQ list
21
+ pretty_mcqs = []
22
+ for idx, mcq in enumerate(mcqs):
23
+ options = ""
24
+ for opt_idx, option in enumerate(mcq['options']):
25
+ options += f"<b>{chr(65+opt_idx)}.</b> {option}<br>"
26
+ question_html = f"<div style='margin-bottom:20px; padding:10px; border:1px solid #ccc; border-radius:10px; background:#f9f9f9;'>"
27
+ question_html += f"<b>Q{idx+1}:</b> {mcq['question']}<br><br>{options}"
28
+ question_html += f"<i><b>Answer:</b> {chr(65+mcq['answer_index'])}</i>"
29
+ question_html += "</div>"
30
+ pretty_mcqs.append(question_html)
31
+
32
+ # Prepare download files
33
+ txt_output = ""
34
+ csv_data = []
35
+
36
+ for idx, mcq in enumerate(mcqs):
37
+ txt_output += f"Q{idx+1}: {mcq['question']}\n"
38
+ for opt_idx, option in enumerate(mcq['options']):
39
+ txt_output += f" {chr(65+opt_idx)}. {option}\n"
40
+ txt_output += f"Answer: {chr(65+mcq['answer_index'])}\n\n"
41
+
42
+ csv_data.append({
43
+ 'Question': mcq['question'],
44
+ 'Option A': mcq['options'][0],
45
+ 'Option B': mcq['options'][1],
46
+ 'Option C': mcq['options'][2],
47
+ 'Option D': mcq['options'][3],
48
+ 'Answer': chr(65+mcq['answer_index'])
49
+ })
50
+
51
+ # Create file objects
52
+ txt_file = io.BytesIO(txt_output.encode('utf-8'))
53
+ csv_file = io.BytesIO()
54
+ pd.DataFrame(csv_data).to_csv(csv_file, index=False)
55
+ csv_file.seek(0)
56
+
57
+ return pretty_mcqs, [("mcqs.txt", txt_file), ("mcqs.csv", csv_file)], "✅ MCQs generated successfully!"
58
+
59
+ except Exception as e:
60
+ return None, None, f"❌ Error generating MCQs: {str(e)}"
61
+
62
+ # Gradio Interface
63
+ with gr.Blocks(theme=gr.themes.Default()) as demo:
64
+ gr.Markdown("<h1 style='text-align:center;'>📚 Smart MCQ Generator</h1>")
65
+ with gr.Row():
66
+ paragraph_input = gr.Textbox(lines=8, label="Enter Paragraph for MCQs", placeholder="Paste your study material here...")
67
+ with gr.Row():
68
+ num_questions_slider = gr.Slider(1, 10, step=1, value=5, label="Number of Questions")
69
+ with gr.Row():
70
+ generate_btn = gr.Button("🚀 Generate MCQs")
71
+ status = gr.Textbox(label="Status", interactive=False)
72
+
73
+ with gr.Row():
74
+ mcq_output = gr.HTML()
75
+
76
+ with gr.Row():
77
+ download_output = gr.File(label="Download MCQs (TXT/CSV)")
78
+
79
+ generate_btn.click(
80
+ fn=generate_mcqs_ui,
81
+ inputs=[paragraph_input, num_questions_slider],
82
+ outputs=[mcq_output, download_output, status]
83
+ )
84
+
85
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+ gradio
2
+ torch
3
+ transformers
4
+ nltk
5
+ scikit-learn
6
+ pandas