mocktestgen committed on
Commit
facb671
Β·
verified Β·
1 Parent(s): 5fb7468

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +287 -0
app.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pdfplumber
3
+ from PIL import Image
4
+ import pytesseract
5
+ import io
6
+ import re
7
+ import random
8
+
9
+ from transformers import pipeline
10
+
11
# Question-generation model: valhalla/t5-base-qg-hl expects the answer span
# wrapped in <hl> tokens inside the context (see highlight_answer_in_context).
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

# Summarization pipeline; no checkpoint pinned, so transformers selects its
# default summarization model at load time. NOTE(review): `summarizer` is not
# referenced anywhere in this file -- presumably reserved for key-concept
# extraction; confirm before removing.
summarizer = pipeline("summarization")
17
+
18
def extract_text_from_pdf(file_bytes):
    """Extract text from a PDF supplied as raw bytes.

    Tries pdfplumber's embedded text layer first; if that yields nothing
    (typical of scanned PDFs), falls back to per-page OCR via ocr_pdf().

    Args:
        file_bytes: The PDF file content as bytes.

    Returns:
        The extracted text, or "" if extraction fails for any reason
        (deliberate best-effort: the caller treats "" as "nothing found").
    """
    try:
        text = ""
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        # Scanned PDFs have no text layer -> fall back to OCR per page.
        if not text.strip():
            text = ocr_pdf(file_bytes)
        return text
    except Exception:
        # Fixed: the bound-but-unused `e` is gone. Swallowing here is
        # intentional best-effort behavior, not an oversight.
        return ""
32
+
33
def ocr_pdf(file_bytes):
    """OCR every page of a PDF (given as bytes) and return the combined text."""
    recognized = []
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            # Render the page to a PIL image at 300 DPI, then OCR it.
            rendered = page.to_image(resolution=300).original
            recognized.append(pytesseract.image_to_string(rendered))
    return "".join(chunk + "\n" for chunk in recognized)
43
+
44
def extract_text_from_image(file_bytes):
    """OCR a single image supplied as raw bytes and return the recognized text."""
    picture = Image.open(io.BytesIO(file_bytes))
    return pytesseract.image_to_string(picture)
48
+
49
def extract_text_from_txt(file_bytes):
    """Decode plain-text file bytes, preferring UTF-8 with a Latin-1 fallback.

    Latin-1 maps every byte value, so the fallback always succeeds.
    """
    for encoding in ("utf-8", "latin-1"):
        try:
            return file_bytes.decode(encoding)
        except UnicodeDecodeError:
            continue
55
+
56
def clean_text(text):
    """Normalize whitespace: squeeze newline runs and space runs, trim ends."""
    squeezed_newlines = re.sub(r'\n+', '\n', text)
    squeezed_spaces = re.sub(r'[ ]{2,}', ' ', squeezed_newlines)
    return squeezed_spaces.strip()
61
+
62
def split_to_sentences(text):
    """Split text into sentences after terminal punctuation; drop empties."""
    pieces = re.split(r'(?<=[.?!])\s+', text)
    return [piece.strip() for piece in pieces if piece.strip()]
66
+
67
def highlight_answer_in_context(context, answer):
    """Wrap the first (case-insensitive) occurrence of `answer` in `context`
    with the <hl> ... <hl> markers expected by the T5 QG model.

    Returns `context` unchanged when the answer cannot be located.
    """
    position = context.lower().find(answer.lower())
    if position == -1:
        return context
    before = context[:position].strip()
    after = context[position + len(answer):].strip()
    return f"{before} <hl> {answer.strip()} <hl> {after}"
79
+
80
def generate_mcq(question_text):
    """Build a 4-option MCQ around `question_text` as the correct answer.

    Distractors are cheap surface perturbations (word shuffles for multi-word
    answers, trailing punctuation for single words) since no dedicated
    distractor-generation model is loaded.

    Args:
        question_text: The correct answer string.

    Returns:
        (options, correct_letter): a list of exactly 4 option strings in
        random order, and the letter 'A'-'D' identifying the correct one.

    Bug fix: the previous unbounded `while len(options) < 4` loop hung
    forever when shuffling could never produce a new string (e.g. all words
    identical, as in "go go"). Attempts are now bounded and punctuation
    variants guarantee termination with exactly 4 distinct options.
    """
    correct_answer = question_text
    words = correct_answer.split()

    options = {correct_answer}

    # Bounded shuffle attempts; degenerate inputs may never yield new strings.
    attempts = 0
    while len(options) < 4 and attempts < 50:
        attempts += 1
        if len(words) > 1:
            shuffled = words[:]
            random.shuffle(shuffled)
            candidate = ' '.join(shuffled)
            if candidate.lower() != correct_answer.lower():
                options.add(candidate)
        else:
            # Single word: perturb with trailing punctuation.
            options.add(correct_answer + random.choice(['.', ',', '?', '!']))

    # Guaranteed fallback: suffix variants are always distinct from the
    # answer and from each other, so we always reach exactly 4 options.
    for suffix in ['.', ',', '?', '!']:
        if len(options) >= 4:
            break
        options.add(correct_answer + suffix)

    options = list(options)
    random.shuffle(options)

    # Letter position of the correct answer after shuffling.
    correct_letter = 'ABCD'[options.index(correct_answer)]

    return options, correct_letter
112
+
113
def generate_questions_mcq(context, num_questions):
    """Produce up to `num_questions` MCQ dicts from `context`.

    Each candidate sentence is highlighted inside the context and fed to the
    QG model; duplicate or non-question outputs are skipped. Falls back to a
    single placeholder entry when nothing usable is generated.
    """
    seen = set()
    results = []

    # Only consider the first 15 sentences to keep generation fast.
    for candidate in split_to_sentences(context)[:15]:
        # Use the candidate sentence itself as the highlighted answer span.
        model_input = highlight_answer_in_context(context, candidate)
        generated = qg_pipeline(model_input, max_length=64)[0]['generated_text']
        if generated in seen or not generated.endswith('?'):
            continue
        seen.add(generated)
        options, correct_letter = generate_mcq(candidate)
        results.append({
            "question": generated,
            "options": options,
            "correct_letter": correct_letter,
            "correct_answer": candidate,
            "explanation": f"Answer explanation: {candidate}",
        })
        if len(results) >= num_questions:
            break

    if not results:
        # Nothing generated -- emit a generic placeholder question.
        results.append({
            "question": "What is the main topic discussed in the content?",
            "options": ["Option A", "Option B", "Option C", "Option D"],
            "correct_letter": "A",
            "correct_answer": "Option A",
            "explanation": "Fallback explanation.",
        })

    return results
155
+
156
def generate_questions_subjective(context, num_questions):
    """Produce up to `num_questions` open-ended question/answer dicts.

    The source sentence itself serves as the suggested answer. Falls back to
    one generic prompt when generation yields nothing usable.
    """
    seen = set()
    results = []

    # Cap candidates at 20 sentences for speed.
    for candidate in split_to_sentences(context)[:20]:
        model_input = highlight_answer_in_context(context, candidate)
        generated = qg_pipeline(model_input, max_length=64)[0]['generated_text']
        if generated in seen or not generated.endswith('?'):
            continue
        seen.add(generated)
        # The candidate sentence doubles as the brief suggested answer.
        results.append({"question": generated, "answer": candidate})
        if len(results) >= num_questions:
            break

    if not results:
        results.append({
            "question": "Describe the main topic discussed in the content.",
            "answer": "The main topic is an overview of the content provided.",
        })

    return results
188
+
189
def format_mcq_output(questions):
    """Render MCQ question dicts as a markdown-style bullet list string."""
    letters = ['A', 'B', 'C', 'D']
    lines = []
    for number, item in enumerate(questions, 1):
        lines.append(f"- Q{number}: {item['question']}\n")
        for letter, option in zip(letters, item['options']):
            lines.append(f" - {letter}. {option}\n")
        lines.append(f"- Correct Answer: {item['correct_letter']}\n")
        lines.append(f"- Explanation: {item['explanation']}\n\n")
    return "".join(lines).strip()
199
+
200
def format_subjective_output(questions):
    """Render subjective Q/A dicts as a bullet list with suggested answers."""
    chunks = [
        f"- Q{number}: {item['question']}\n- Suggested Answer: {item['answer']}\n\n"
        for number, item in enumerate(questions, 1)
    ]
    return "".join(chunks).strip()
206
+
207
def main_process(file, question_type, num_questions):
    """Gradio callback: extract text from the upload and generate questions.

    Dispatches on file extension (pdf / image / txt), cleans the extracted
    text, then delegates to the MCQ or subjective generator. Returns a
    display string (questions, or a user-facing error message).

    NOTE(review): relies on the upload exposing `.read()` and `.name`,
    i.e. the Gradio 3.x file object -- confirm against the pinned version.
    """
    if not file:
        return "Please upload a file."

    raw_bytes = file.read()
    lowered_name = file.name.lower()

    if lowered_name.endswith(".pdf"):
        text = extract_text_from_pdf(raw_bytes)
    elif lowered_name.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
        text = extract_text_from_image(raw_bytes)
    elif lowered_name.endswith(".txt"):
        text = extract_text_from_txt(raw_bytes)
    else:
        return "Unsupported file type. Please upload PDF, Image, or TXT."

    text = clean_text(text)

    # Guard against empty/garbage extractions before invoking the models.
    if len(text) < 30:
        return "Extracted text is too short or empty. Please check your input file."

    if question_type == "MCQ":
        return format_mcq_output(generate_questions_mcq(text, num_questions))
    return format_subjective_output(generate_questions_subjective(text, num_questions))
238
+
239
# --- Gradio UI -------------------------------------------------------------
# Custom CSS: styled header/footer, a scrollable monospace output panel,
# and indigo buttons.
with gr.Blocks(css="""
#header {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    font-weight: 700;
    font-size: 28px;
    text-align: center;
    margin-bottom: 20px;
    color: #333;
}
#footer {
    font-size: 12px;
    color: #666;
    margin-top: 30px;
    text-align: center;
}
.output-area {
    white-space: pre-wrap;
    background-color: #f3f4f6;
    padding: 15px;
    border-radius: 8px;
    font-family: monospace;
    max-height: 450px;
    overflow-y: auto;
}
.gr-button {
    background-color: #4f46e5;
    color: white;
    font-weight: bold;
    border-radius: 8px;
}
.gr-button:hover {
    background-color: #4338ca;
}
""") as demo:
    gr.Markdown("<div id='header'>📚 Study Content Question Generator</div>")
    with gr.Row():
        # NOTE(review): gr.File(type="file") is Gradio 3.x API; Gradio 4.x
        # only accepts type="filepath"/"binary" -- confirm the pinned version.
        file_input = gr.File(label="Upload PDF, Image, or Text file", type="file")
        with gr.Column():
            question_type = gr.Radio(choices=["MCQ", "Subjective"], label="Question Type", value="MCQ")
            num_questions = gr.Slider(1, 10, value=5, step=1, label="Number of Questions")
            generate_btn = gr.Button("Generate Questions")
    output = gr.Textbox(label="Generated Questions", lines=20, interactive=False, elem_classes="output-area")

    # Wire the button to the processing callback.
    generate_btn.click(fn=main_process, inputs=[file_input, question_type, num_questions], outputs=output)

    gr.Markdown("<div id='footer'>Made with ❤️ using Hugging Face Spaces and Transformers</div>")

# Launch the app only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()