quiz_generator / app.py
Rupesx007's picture
updated app.py
b69329e verified
import os
import re
import PyPDF2
import docx
import googleapiclient.discovery
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from youtube_transcript_api import YouTubeTranscriptApi
import streamlit as st
import pandas as pd
import random
from io import StringIO
import logging
# Logging: configure the root logger once at import time so that INFO-and-above
# records are emitted with a timestamp, the logger name and the severity level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-scoped logger used by every function in this file.
logger = logging.getLogger(__name__)
# Fetch the NLTK data needed for sentence splitting, word tokenizing and POS tagging.
# Recent NLTK releases look up "punkt_tab" / "averaged_perceptron_tagger_eng" while
# older ones use the original package names, so both generations are requested; an
# id unknown to the installed NLTK version is reported as a failed download rather
# than raising. quiet=True keeps the console clean, because Streamlit re-executes
# this script (and therefore these calls) on every user interaction.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource, quiet=True)
class QuizGenerator:
    """Generate quiz questions from free text, documents and YouTube transcripts.

    Constructing an instance loads three Hugging Face models (a BART
    summarizer, a T5 question generator and a DistilBERT extractive QA
    model), so an instance should be created once and reused.
    """

    def __init__(self):
        """Load the summarization, question-generation and QA models."""
        # Summarizer used to shrink long inputs before question generation.
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        # Seq2seq model + tokenizer that turn a sentence into a question.
        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-qg-hl")
        self.qg_tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-base-qg-hl")
        # Extractive QA model used to find the answer to each generated question.
        self.qa_model = pipeline('question-answering', model='distilbert-base-cased-distilled-squad')

    def extract_text_from_pdf(self, pdf_file):
        """Extract text from a PDF file.

        Returns the text of all pages, each followed by a newline, or "" if
        the file cannot be read (the error is logged).
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Defensive: treat a missing (None) extraction result as empty text
            # so a single text-less page cannot void the whole document.
            return "".join((page.extract_text() or "") + "\n" for page in pdf_reader.pages)
        except Exception as e:
            logger.error("Error extracting text from PDF: %s", e)
            return ""

    def extract_text_from_docx(self, docx_file):
        """Extract text from a DOCX file.

        Returns every paragraph followed by a newline, or "" if the file
        cannot be read (the error is logged).
        """
        try:
            doc = docx.Document(docx_file)
            return "".join(para.text + "\n" for para in doc.paragraphs)
        except Exception as e:
            logger.error("Error extracting text from DOCX: %s", e)
            return ""

    def extract_text_from_txt(self, txt_file):
        """Extract text from a TXT file.

        Accepts a binary stream (decoded as UTF-8, with an optional byte-order
        mark removed) or a text stream. Returns "" if reading or decoding
        fails (the error is logged).
        """
        try:
            data = txt_file.read()
            if isinstance(data, (bytes, bytearray)):
                # 'utf-8-sig' decodes plain UTF-8 too, but drops a leading BOM
                # that would otherwise end up as a stray character in the text.
                return bytes(data).decode('utf-8-sig')
            return data.lstrip('\ufeff')
        except Exception as e:
            logger.error("Error extracting text from TXT: %s", e)
            return ""

    def get_youtube_transcript(self, video_id):
        """Extract transcript from a YouTube video.

        Returns the transcript segments joined by single spaces, or "" if the
        transcript cannot be fetched (the error is logged).
        """
        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
            return ' '.join(item['text'] for item in transcript_list)
        except Exception as e:
            logger.error("Error getting YouTube transcript: %s", e)
            return ""

    def summarize_text(self, text, max_length=1000):
        """Summarize long text to make processing more efficient.

        Text of at most ``max_length`` characters is returned unchanged.
        Longer text is split into chunks of roughly 3000 characters, each
        chunk is summarized, and the summaries are joined with spaces.
        """
        if len(text) <= max_length:
            return text

        summaries = []
        for chunk in self._split_text_into_chunks(text, max_length=3000):
            if len(chunk) < 100:  # Too little material to summarize
                continue
            # truncation=True protects against a chunk (e.g. one very long
            # sentence) that tokenizes to more than the model can accept.
            summary = self.summarizer(
                chunk, max_length=300, min_length=100, do_sample=False, truncation=True
            )
            summaries.append(summary[0]['summary_text'])
        return " ".join(summaries)

    def _split_text_into_chunks(self, text, max_length=3000):
        """Split text into sentence-aligned chunks of at most max_length characters.

        Sentences are never cut, so a single sentence longer than
        ``max_length`` becomes a chunk of its own. Empty chunks are never
        returned.
        """
        chunks = []
        current_chunk = ""
        for sentence in sent_tokenize(text):
            # Include the joining space in the length check.
            candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
            if len(candidate) <= max_length:
                current_chunk = candidate
                continue
            # Flush only real content: when the very first sentence is already
            # oversized there is nothing to flush yet.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = sentence
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        return chunks

    def generate_questions(self, text, num_questions=5):
        """Generate questions based on the input text.

        Returns a list of at most ``num_questions`` dicts with the keys
        'question', 'answer' and 'context' (the sentence the question was
        generated from). Returns [] if anything fails (the error is logged).
        """
        try:
            # Summarize text if it's too long
            processed_text = self.summarize_text(text)
            sentences = sent_tokenize(processed_text)
            random.shuffle(sentences)  # Randomize to get different questions each time

            questions = []
            seen_questions = set()
            # Examine up to three candidate sentences per requested question,
            # since some candidates are skipped or rejected below.
            for sentence in sentences[:num_questions * 3]:
                if len(sentence.split()) < 5:  # Skip short sentences
                    continue

                # NOTE(review): the "-hl" model name suggests it expects the
                # answer span wrapped in <hl> markers; this prompt omits them.
                # Confirm against the model card.
                input_text = f"generate question: {sentence}"
                inputs = self.qg_tokenizer.encode(
                    input_text, return_tensors="pt", max_length=512, truncation=True
                )
                outputs = self.qg_model.generate(
                    inputs, max_length=64, num_beams=4, early_stopping=True
                )
                question = self.qg_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

                # An empty question makes the QA pipeline raise, which would
                # discard every question collected so far; duplicates add nothing.
                question_key = question.lower()
                if not question or question_key in seen_questions:
                    continue
                seen_questions.add(question_key)

                # Use QA model to get answer
                answer = self.qa_model(question=question, context=processed_text)
                if answer['score'] > 0.1 and answer['answer'].strip():
                    questions.append({
                        'question': question,
                        'answer': answer['answer'],
                        'context': sentence
                    })
                    if len(questions) >= num_questions:
                        break
            return questions
        except Exception as e:
            logger.error("Error generating questions: %s", e)
            return []

    def generate_mcq(self, questions, num_options=4):
        """Convert open-ended questions to multiple-choice questions.

        ``questions`` is a list of dicts as returned by generate_questions.
        Each result dict has the keys 'question', 'options' (shuffled),
        'correct_answer' and 'correct_index' (position of the correct answer
        within 'options').
        """
        num_distractors = max(num_options - 1, 0)
        mcqs = []
        for q in questions:
            correct_answer = q['answer']
            distractors = self._generate_distractors(q['context'], correct_answer, num_distractors)
            options = distractors + [correct_answer]
            random.shuffle(options)
            mcqs.append({
                'question': q['question'],
                'options': options,
                'correct_answer': correct_answer,
                'correct_index': options.index(correct_answer)
            })
        return mcqs

    def _generate_distractors(self, context, correct_answer, num_distractors=3):
        """Generate plausible but incorrect answers.

        Candidates are the nouns of ``context`` (longer than two characters)
        that are neither the answer nor one of its words. Generic options are
        appended only when there are not enough noun candidates. Returns at
        most ``num_distractors`` strings.
        """
        if num_distractors <= 0:
            return []

        answer_key = correct_answer.strip().lower()
        # Words of the answer must not reappear as "wrong" options.
        answer_words = set(re.findall(r"\w+", answer_key))
        noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')

        seen = set()
        candidates = []
        for word, pos in nltk.pos_tag(nltk.word_tokenize(context)):
            key = word.lower()
            if pos not in noun_tags or len(word) <= 2:
                continue
            if key == answer_key or key in answer_words or key in seen:
                continue
            seen.add(key)
            candidates.append(word)

        random.shuffle(candidates)
        distractors = candidates[:num_distractors]

        # If we don't have enough distractors, add some generic ones
        generic_distractors = ["None of the above", "Cannot be determined", "All of the above"]
        for generic in generic_distractors:
            if len(distractors) >= num_distractors:
                break
            if generic.lower() != answer_key:
                distractors.append(generic)
        return distractors

    def generate_true_false(self, text, num_questions=5):
        """Generate true/false questions from text.

        Each source sentence yields a true statement and, when it has more
        than four words, a false variant made by inserting a negating word.
        Returns at most ``num_questions`` dicts with the keys 'statement' and
        'is_true', in random order, or [] on failure (the error is logged).
        """
        try:
            # Reuse question generation to pick the source sentences.
            questions = self.generate_questions(text, num_questions)
            true_false = []
            for q in questions:
                statement = q['context']
                # Original statement is true
                true_false.append({
                    'statement': statement,
                    'is_true': True
                })
                words = statement.split()
                if len(words) > 4:
                    # Simple modification: prefix a random word with a negation.
                    change_idx = random.randint(0, len(words) - 1)
                    negation = random.choice(["not", "never", "rarely", "incorrectly"])
                    words[change_idx] = negation + " " + words[change_idx]
                    # Only add a false statement when one was actually built,
                    # so a short sentence never reuses an earlier one.
                    true_false.append({
                        'statement': " ".join(words),
                        'is_true': False
                    })
            # Shuffle and return required number
            random.shuffle(true_false)
            return true_false[:num_questions]
        except Exception as e:
            logger.error("Error generating true/false questions: %s", e)
            return []
@st.cache_resource
def _load_quiz_generator():
    """Build the QuizGenerator once and reuse it across Streamlit reruns."""
    return QuizGenerator()


def _preview_text(label, text_content):
    """Show the first 500 characters of text_content inside a collapsed expander."""
    with st.expander(label):
        st.text(text_content[:500] + "..." if len(text_content) > 500 else text_content)


def _render_mcqs(mcqs):
    """Display multiple-choice questions with lettered options and hidden answers."""
    if not mcqs:
        st.error("Failed to generate questions. Try with different content.")
        return
    st.success(f"Generated {len(mcqs)} multiple choice questions!")
    for i, q in enumerate(mcqs, 1):
        st.subheader(f"Question {i}: {q['question']}")
        # Derive letters from the option count instead of assuming exactly four.
        option_letters = [chr(ord('A') + j) for j in range(len(q['options']))]
        for letter, option in zip(option_letters, q['options']):
            st.write(f"{letter}. {option}")
        with st.expander("Reveal Answer"):
            st.write(f"Correct Answer: {option_letters[q['correct_index']]}. {q['correct_answer']}")
        st.divider()


def _render_true_false(tf_questions):
    """Display true/false statements with hidden answers."""
    if not tf_questions:
        st.error("Failed to generate true/false questions. Try with different content.")
        return
    st.success(f"Generated {len(tf_questions)} true/false questions!")
    for i, q in enumerate(tf_questions, 1):
        st.subheader(f"Question {i}: True or False?")
        st.write(q['statement'])
        with st.expander("Reveal Answer"):
            st.write(f"Answer: {'True' if q['is_true'] else 'False'}")
        st.divider()


def _render_open_ended(questions):
    """Display open-ended questions with hidden suggested answers and context."""
    if not questions:
        st.error("Failed to generate questions. Try with different content.")
        return
    st.success(f"Generated {len(questions)} open-ended questions!")
    for i, q in enumerate(questions, 1):
        st.subheader(f"Question {i}: {q['question']}")
        with st.expander("Reveal Answer"):
            st.write(f"Suggested Answer: {q['answer']}")
            st.write(f"Context: {q['context']}")
        st.divider()


def create_streamlit_app():
    """Render the QuizWhiz page.

    Lets the user choose a question type and a content source (typed text,
    uploaded document, YouTube video or a topic), extracts the text, and on
    request generates and displays the quiz.
    """
    st.set_page_config(page_title="QuizWhiz", page_icon="📚", layout="wide")
    st.title("QuizWhiz - Comprehensive Quiz Generator")
    st.subheader("Generate quizzes from various sources: text, documents, and YouTube videos")

    # Streamlit re-runs this function on every interaction; the cached loader
    # avoids re-loading the models each time.
    quiz_gen = _load_quiz_generator()

    # Sidebar for options
    st.sidebar.header("Quiz Options")
    quiz_type = st.sidebar.selectbox(
        "Question Type",
        ["Multiple Choice", "True/False", "Open-Ended"]
    )
    num_questions = st.sidebar.slider("Number of Questions", 3, 20, 5)

    # Source selection
    st.header("Select Your Content Source")
    source_type = st.radio(
        "Content Source",
        ["Text Input", "Document Upload", "YouTube Video", "Topic/Subject"]
    )

    text_content = ""
    # Handle different source types
    if source_type == "Text Input":
        text_content = st.text_area("Enter your text content here:", height=250)
    elif source_type == "Document Upload":
        uploaded_file = st.file_uploader("Upload your document", type=['pdf', 'docx', 'txt'])
        if uploaded_file is not None:
            st.success(f"File '{uploaded_file.name}' uploaded successfully!")
            # Compare extensions case-insensitively so e.g. "REPORT.PDF" is
            # routed to the PDF extractor instead of falling through.
            file_name = uploaded_file.name.lower()
            if file_name.endswith('.pdf'):
                text_content = quiz_gen.extract_text_from_pdf(uploaded_file)
            elif file_name.endswith('.docx'):
                text_content = quiz_gen.extract_text_from_docx(uploaded_file)
            elif file_name.endswith('.txt'):
                text_content = quiz_gen.extract_text_from_txt(uploaded_file)
            # Show text preview
            if text_content:
                _preview_text("Preview Extracted Text", text_content)
            else:
                st.error("Failed to extract text from the document.")
    elif source_type == "YouTube Video":
        youtube_url = st.text_input("Enter YouTube Video URL:")
        if youtube_url:
            # Extract the 11-character video ID from the URL
            video_id_match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', youtube_url)
            if video_id_match:
                video_id = video_id_match.group(1)
                # Show video embed
                st.video(youtube_url)
                # Extract transcript
                with st.spinner("Extracting video transcript..."):
                    text_content = quiz_gen.get_youtube_transcript(video_id)
                if text_content:
                    _preview_text("Preview Transcript", text_content)
                else:
                    st.error("Failed to extract transcript. This video might not have captions.")
            else:
                st.error("Invalid YouTube URL. Please enter a valid URL.")
    elif source_type == "Topic/Subject":
        topic = st.text_input("Enter a topic or subject:")
        if topic:
            # For this demo, we'll use a predefined text about the topic
            # In a real app, you might use an API to fetch content about the topic
            st.info(f"Generating quiz about: {topic}")
            text_content = f"The topic of {topic} is a fascinating subject to explore. " \
                           f"There are many important concepts and facts related to {topic} " \
                           f"that make it an essential area of study. Understanding {topic} " \
                           f"requires careful consideration of its key principles."
            # Placeholder for a real implementation that would gather information about the topic
            st.warning("In a complete implementation, this would gather information about the topic from reliable sources.")

    # Generate Quiz Button (whitespace-only input counts as "no content")
    if text_content and text_content.strip():
        if st.button("Generate Quiz"):
            with st.spinner("Generating quiz questions..."):
                if quiz_type == "Multiple Choice":
                    # Generate questions first, then convert to MCQs
                    questions = quiz_gen.generate_questions(text_content, num_questions)
                    _render_mcqs(quiz_gen.generate_mcq(questions))
                elif quiz_type == "True/False":
                    _render_true_false(quiz_gen.generate_true_false(text_content, num_questions))
                elif quiz_type == "Open-Ended":
                    _render_open_ended(quiz_gen.generate_questions(text_content, num_questions))
    else:
        st.info("Please provide content or select a source to generate a quiz.")

    # Footer
    st.sidebar.divider()
    st.sidebar.caption("QuizWhiz - Powered by AI")
    st.sidebar.caption("© 2025 QuizWhiz Technologies")
# Script entry point: build the UI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    create_streamlit_app()