Spaces:
Runtime error
Runtime error
import io
import logging
import random
import re

import gradio as gr
import pdfplumber
from PIL import Image
from transformers import pipeline
# Question-generation pipeline. valhalla/t5-base-qg-hl is used because it
# supports marking the answer span in the input with <hl> tokens.
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

# Summarization pipeline intended for key-sentence extraction (to identify key
# concepts). The checkpoint is named explicitly instead of relying on the
# library's implicit per-task default, so the loaded model cannot change
# silently between library releases.
# NOTE(review): no function visible in this file calls `summarizer`; confirm it
# is still needed before paying its load cost at startup.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
def extract_text_from_pdf(file_bytes):
    """Extract the selectable text of every page of a PDF.

    There is deliberately no OCR fallback: pytesseract needs a system-level
    Tesseract installation.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The text of each page that yielded any text, every page followed by a
        newline. An empty string is returned when the document cannot be
        opened or parsed; the failure is logged rather than raised so the
        caller can report "no text" to the user.
    """
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception:
        # Best-effort boundary: any parser failure degrades to "no text", but
        # the traceback is kept in the logs instead of being discarded.
        logging.getLogger(__name__).exception("Failed to extract text from PDF")
        return ""
    # Pages that yield no text (None or "") are skipped.
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)
def extract_text_from_image(file_bytes):
    """Return the notice shown for image uploads.

    OCR is disabled because it depends on a system Tesseract installation, so
    ``file_bytes`` is ignored and a fixed explanatory message is returned.
    """
    ocr_unavailable_notice = "OCR not supported in this environment. Please upload a PDF or TXT file containing selectable text."
    return ocr_unavailable_notice
def extract_text_from_txt(file_bytes):
    """Decode the bytes of a plain-text file.

    Args:
        file_bytes: Raw bytes of the text file.

    Returns:
        The decoded text. UTF-8 is tried first; a leading byte-order mark is
        dropped so it does not end up in the text as ``\\ufeff``. Bytes that
        are not valid UTF-8 are decoded as Latin-1, which accepts any input.
    """
    try:
        # "utf-8-sig" behaves like "utf-8" but skips an optional leading BOM.
        return file_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        return file_bytes.decode("latin-1")
def clean_text(text):
    """Collapse repeated newlines and repeated spaces, then trim the ends.

    Runs of newlines become a single newline and runs of two or more spaces
    become a single space; other whitespace (such as tabs) is left alone.
    """
    substitutions = (
        (r'\n+', '\n'),
        (r'[ ]{2,}', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def split_to_sentences(text):
    """Split text into sentences at whitespace that follows '.', '?' or '!'.

    Returns the trimmed, non-empty pieces in their original order; the
    terminating punctuation stays attached to each sentence.
    """
    pieces = re.split(r'(?<=[.?!])\s+', text)
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]
def highlight_answer_in_context(context, answer):
    """Wrap the first occurrence of ``answer`` in ``context`` with <hl> tokens.

    The question-generation model takes its input in the form
    ``context <hl> answer <hl> context``.

    Args:
        context: The text to search.
        answer: The span to highlight; matched case-insensitively.

    Returns:
        ``"<before> <hl> <answer> <hl> <after>"`` where the surrounding text
        and the answer are stripped of outer whitespace, or ``context``
        unchanged when the answer is blank or does not occur in it.
    """
    # A blank answer would "match" at position 0 and highlight nothing.
    if not answer.strip():
        return context

    # Search the original string directly: lowercasing both sides and reusing
    # the index is wrong when lowercasing changes a string's length.
    match = re.search(re.escape(answer), context, flags=re.IGNORECASE)
    if match is None:
        return context

    before = context[:match.start()].strip()
    after = context[match.end():].strip()
    return f"{before} <hl> {answer.strip()} <hl> {after}"
def generate_mcq(question_text):
    """Build four answer options: the correct one plus three distractors.

    There is no distractor-generation model, so distractors are derived from
    the correct answer itself: first by shuffling its words, then, if that
    cannot produce enough distinct strings, by appending a punctuation mark.

    Args:
        question_text: The correct answer text (despite the parameter name).

    Returns:
        A tuple ``(options, correct_letter)``: ``options`` is a list of four
        distinct strings in random order and ``correct_letter`` is the letter
        ('A'-'D') at whose position the correct answer sits.
    """
    correct_answer = question_text
    words = correct_answer.split()
    # Compare shuffles against the whitespace-normalized answer so that a
    # same-order "shuffle" is never accepted as a distractor.
    normalized_answer = ' '.join(words).lower()
    options = {correct_answer}

    if len(words) > 1:
        # Bounded: short or repetitive answers have too few distinct word
        # orders, so an unbounded retry loop would never finish.
        max_shuffle_attempts = 50
        for _ in range(max_shuffle_attempts):
            if len(options) >= 4:
                break
            shuffled = words[:]
            random.shuffle(shuffled)
            option = ' '.join(shuffled)
            if option.lower() != normalized_answer:
                options.add(option)

    # Fallback: each suffix adds a character the answer and its shuffles do
    # not have, so the four variants are always new and always suffice.
    suffixes = ['.', ',', '?', '!']
    random.shuffle(suffixes)
    for suffix in suffixes:
        if len(options) >= 4:
            break
        options.add(correct_answer + suffix)

    options = list(options)
    random.shuffle(options)
    correct_letter = 'ABCD'[options.index(correct_answer)]
    return options, correct_letter
def generate_questions_mcq(context, num_questions):
    """Generate multiple-choice questions from ``context``.

    Each of the first 15 sentences is tried in turn as the answer span. A
    generated question is kept only if it ends with '?' and has not been seen
    before. Generation stops once ``num_questions`` questions are collected.

    Returns:
        A list of dicts with the keys ``question``, ``options``,
        ``correct_letter``, ``correct_answer`` and ``explanation``. If nothing
        usable was generated, the list holds one generic placeholder question.
    """
    seen_questions = set()
    collected = []

    # Only the leading sentences are considered, to keep generation fast.
    for candidate in split_to_sentences(context)[:15]:
        model_input = highlight_answer_in_context(context, candidate)
        generated = qg_pipeline(model_input, max_length=64)[0]['generated_text']

        if generated not in seen_questions and generated.endswith('?'):
            seen_questions.add(generated)
            choices, answer_letter = generate_mcq(candidate)
            collected.append({
                "question": generated,
                "options": choices,
                "correct_letter": answer_letter,
                "correct_answer": candidate,
                "explanation": f"Answer explanation: {candidate}"
            })
            if len(collected) >= num_questions:
                break

    if collected:
        return collected

    # Nothing could be generated: fall back to a generic placeholder question.
    return [{
        "question": "What is the main topic discussed in the content?",
        "options": ["Option A", "Option B", "Option C", "Option D"],
        "correct_letter": "A",
        "correct_answer": "Option A",
        "explanation": "Fallback explanation."
    }]
def generate_questions_subjective(context, num_questions):
    """Generate open-ended questions from ``context``.

    Each of the first 20 sentences is tried in turn as the answer span. A
    generated question is kept only if it ends with '?' and has not been seen
    before; the source sentence itself is used as the suggested answer.
    Generation stops once ``num_questions`` questions are collected.

    Returns:
        A list of dicts with the keys ``question`` and ``answer``. If nothing
        usable was generated, the list holds one generic placeholder entry.
    """
    seen_questions = set()
    collected = []

    for candidate in split_to_sentences(context)[:20]:
        model_input = highlight_answer_in_context(context, candidate)
        generated = qg_pipeline(model_input, max_length=64)[0]['generated_text']

        if generated not in seen_questions and generated.endswith('?'):
            seen_questions.add(generated)
            # The sentence the question was generated from doubles as the answer.
            collected.append({
                "question": generated,
                "answer": candidate
            })
            if len(collected) >= num_questions:
                break

    if collected:
        return collected

    return [{
        "question": "Describe the main topic discussed in the content.",
        "answer": "The main topic is an overview of the content provided."
    }]
def format_mcq_output(questions):
    """Render multiple-choice questions as a plain-text list.

    Each question is numbered from 1 and followed by its options labelled
    A-D, the correct letter and the explanation. Questions are separated by a
    blank line and the final text is stripped of outer whitespace.
    """
    labels = ['A', 'B', 'C', 'D']
    pieces = []
    for number, item in enumerate(questions, start=1):
        pieces.append(f"- Q{number}: {item['question']}\n")
        for position, choice in enumerate(item['options']):
            pieces.append(f" - {labels[position]}. {choice}\n")
        pieces.append(f"- Correct Answer: {item['correct_letter']}\n")
        pieces.append(f"- Explanation: {item['explanation']}\n\n")
    return "".join(pieces).strip()
def format_subjective_output(questions):
    """Render subjective questions and their suggested answers as plain text.

    Questions are numbered from 1 and separated by a blank line; the final
    text is stripped of outer whitespace.
    """
    pieces = []
    for number, item in enumerate(questions, start=1):
        pieces.append(f"- Q{number}: {item['question']}\n")
        pieces.append(f"- Suggested Answer: {item['answer']}\n\n")
    return "".join(pieces).strip()
def main_process(file, question_type, num_questions):
    """Turn an uploaded study file into formatted questions.

    Args:
        file: The upload, given either as a filesystem path string or as an
            object whose ``name`` attribute is that path.
        question_type: ``"MCQ"`` for multiple-choice output; any other value
            produces subjective questions.
        num_questions: Maximum number of questions to generate.

    Returns:
        The formatted questions, or a user-facing message when no file was
        given, the file cannot be read, its type is unsupported (or is an
        image, for which OCR is unavailable), or too little text was found.
    """
    if not file:
        return "Please upload a file."

    # Always read the bytes from the path on disk. The upload may arrive as a
    # plain path string, and when it is an object its own read() is not
    # guaranteed to return the uploaded data.
    path = file if isinstance(file, str) else file.name
    try:
        with open(path, "rb") as handle:
            file_bytes = handle.read()
    except OSError:
        return "Could not read the uploaded file. Please try uploading it again."

    fname = path.lower()
    if fname.endswith(".pdf"):
        extracted_text = extract_text_from_pdf(file_bytes)
    elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
        # OCR is unsupported: show its notice to the user instead of feeding
        # the notice into question generation as if it were document text.
        return extract_text_from_image(file_bytes)
    elif fname.endswith(".txt"):
        extracted_text = extract_text_from_txt(file_bytes)
    else:
        return "Unsupported file type. Please upload PDF, Image, or TXT."

    extracted_text = clean_text(extracted_text)
    if len(extracted_text) < 30:
        return "Extracted text is too short or empty. Please check your input file."

    if question_type == "MCQ":
        questions = generate_questions_mcq(extracted_text, num_questions)
        return format_mcq_output(questions)

    questions = generate_questions_subjective(extracted_text, num_questions)
    return format_subjective_output(questions)
def _run_main_process(uploaded, question_type, num_questions):
    """Adapt Gradio's upload value for ``main_process``.

    Depending on the Gradio release, the File component delivers either a
    path string or an object whose ``name`` is the path. Either way the file
    is reopened in binary mode, so the handler receives an object with both
    ``read()`` (returning the uploaded bytes) and ``name``.
    """
    if not uploaded:
        return main_process(uploaded, question_type, num_questions)
    path = uploaded if isinstance(uploaded, str) else uploaded.name
    with open(path, "rb") as handle:
        return main_process(handle, question_type, num_questions)


# NOTE(review): the original indentation of this layout was not preserved; the
# Row/Column nesting below is the most plausible reading — confirm visually.
with gr.Blocks(css="""
#header {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    font-weight: 700;
    font-size: 28px;
    text-align: center;
    margin-bottom: 20px;
    color: #333;
}
#footer {
    font-size: 12px;
    color: #666;
    margin-top: 30px;
    text-align: center;
}
.output-area {
    white-space: pre-wrap;
    background-color: #f3f4f6;
    padding: 15px;
    border-radius: 8px;
    font-family: monospace;
    max-height: 450px;
    overflow-y: auto;
}
.gr-button {
    background-color: #4f46e5;
    color: white;
    font-weight: bold;
    border-radius: 8px;
}
.gr-button:hover {
    background-color: #4338ca;
}
""") as demo:
    # NOTE(review): the leading "π" looks like a mis-decoded emoji whose
    # original character cannot be recovered from here; restore it if known.
    gr.Markdown("<div id='header'>π Study Content Question Generator</div>")
    with gr.Row():
        # `type` is left at the component default: type="file" is rejected by
        # Gradio 4+, and the default differs between major versions, which
        # `_run_main_process` absorbs.
        file_input = gr.File(label="Upload PDF, Image, or Text file")
        with gr.Column():
            question_type = gr.Radio(choices=["MCQ", "Subjective"], label="Question Type", value="MCQ")
            num_questions = gr.Slider(1, 10, value=5, step=1, label="Number of Questions")
            generate_btn = gr.Button("Generate Questions")
    output = gr.Textbox(label="Generated Questions", lines=20, interactive=False, elem_classes="output-area")
    generate_btn.click(fn=_run_main_process, inputs=[file_input, question_type, num_questions], outputs=output)
    gr.Markdown("<div id='footer'>Made with ❤️ using Hugging Face Spaces and Transformers</div>")

if __name__ == "__main__":
    demo.launch()