# Captured from a Hugging Face Space page (header read: "Spaces: Sleeping").
import logging
import os
import random
import re
import textwrap
from io import StringIO

import docx
import googleapiclient.discovery
import nltk
import pandas as pd
import PyPDF2
import streamlit as st
from nltk.tokenize import sent_tokenize
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from youtube_transcript_api import YouTubeTranscriptApi
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Download the NLTK resources needed by sent_tokenize, word_tokenize and pos_tag.
# Older NLTK releases look these up as "punkt" / "averaged_perceptron_tagger";
# newer releases use "punkt_tab" / "averaged_perceptron_tagger_eng". Both
# generations are requested so the app starts on either; a name the installed
# NLTK index does not know is reported by download() as a failed (False)
# result rather than an exception, so the extra requests are harmless.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource, quiet=True)
class QuizGenerator:
    """Turn raw text into open-ended, multiple-choice or true/false questions.

    The class bundles three Hugging Face models (summarizer, question
    generator, extractive QA) together with helpers that pull text out of
    PDF/DOCX/TXT files and YouTube transcripts. All public methods are
    best-effort: failures are logged and reported as an empty result
    (``""`` or ``[]``) instead of raising.
    """

    # Fallback options, used only when the context has too few usable nouns.
    _GENERIC_DISTRACTORS = ("None of the above", "Cannot be determined", "All of the above")
    # Words inserted into a sentence to turn it into a (crudely) false statement.
    _NEGATION_WORDS = ("not", "never", "rarely", "incorrectly")

    def __init__(self):
        """Load the summarization, question-generation and QA models."""
        # Initialize the summarizer and question generator models
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        # Load question generation model
        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-qg-hl")
        self.qg_tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-base-qg-hl")
        # Initialize MCQ generation components
        self.qa_model = pipeline('question-answering', model='distilbert-base-cased-distilled-squad')

    # ------------------------------------------------------------------
    # Text extraction
    # ------------------------------------------------------------------
    def extract_text_from_pdf(self, pdf_file):
        """Extract text from a PDF file.

        Args:
            pdf_file: Path or binary file-like object accepted by
                ``PyPDF2.PdfReader``.

        Returns:
            The text of all pages, each followed by a newline, or ``""`` if
            the file cannot be read.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # A page with no extractable text must not abort the whole
            # document, so a missing (None) result is treated as empty.
            return "".join((page.extract_text() or "") + "\n" for page in pdf_reader.pages)
        except Exception as e:
            logger.error("Error extracting text from PDF: %s", e)
            return ""

    def extract_text_from_docx(self, docx_file):
        """Extract text from a DOCX file.

        Args:
            docx_file: Path or file-like object accepted by ``docx.Document``.

        Returns:
            The paragraphs' text, each followed by a newline, or ``""`` on
            failure.
        """
        try:
            doc = docx.Document(docx_file)
            return "".join(para.text + "\n" for para in doc.paragraphs)
        except Exception as e:
            logger.error("Error extracting text from DOCX: %s", e)
            return ""

    def extract_text_from_txt(self, txt_file):
        """Extract text from a TXT file.

        Args:
            txt_file: File-like object whose ``read()`` returns bytes (or
                already-decoded text).

        Returns:
            The decoded content. Bytes are decoded as UTF-8 (a leading BOM is
            dropped); undecodable bytes become U+FFFD instead of discarding
            the whole file. ``""`` is returned if reading fails.
        """
        try:
            raw = txt_file.read()
            if isinstance(raw, str):
                return raw
            return raw.decode('utf-8-sig', errors='replace')
        except Exception as e:
            logger.error("Error extracting text from TXT: %s", e)
            return ""

    def get_youtube_transcript(self, video_id):
        """Extract transcript from a YouTube video.

        Args:
            video_id: The YouTube video id.

        Returns:
            All caption snippets joined by single spaces, or ``""`` when the
            transcript cannot be fetched.
        """
        try:
            if hasattr(YouTubeTranscriptApi, "get_transcript"):
                transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
            else:
                # Releases without the static helper expose an instance-based
                # fetch(); to_raw_data() yields the same list-of-dicts shape.
                transcript_list = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
            return ' '.join(item['text'] for item in transcript_list)
        except Exception as e:
            logger.error("Error getting YouTube transcript: %s", e)
            return ""

    # ------------------------------------------------------------------
    # Summarization
    # ------------------------------------------------------------------
    def summarize_text(self, text, max_length=1000):
        """Summarize long text to make processing more efficient.

        Args:
            text: The text to condense.
            max_length: Texts of at most this many characters are returned
                unchanged.

        Returns:
            ``text`` itself when it is short enough, otherwise the chunk
            summaries joined by spaces.
        """
        if len(text) <= max_length:
            return text
        summaries = []
        for chunk in self._split_text_into_chunks(text, max_length=3000):
            if len(chunk) < 100:  # Skip chunks that are too small
                continue
            # truncation=True keeps an unusually token-dense chunk within the
            # model's input limit instead of failing the whole summary.
            summary = self.summarizer(
                chunk, max_length=300, min_length=100, do_sample=False, truncation=True
            )
            summaries.append(summary[0]['summary_text'])
        return " ".join(summaries)

    def _split_text_into_chunks(self, text, max_length=3000):
        """Split text into chunks of at most max_length characters.

        Sentences are kept together where possible; see ``_pack_sentences``
        for the packing rules. ``max_length`` must be positive.
        """
        return self._pack_sentences(sent_tokenize(text), max_length)

    @staticmethod
    def _pack_sentences(sentences, max_length):
        """Greedily pack sentences into chunks no longer than max_length.

        A sentence longer than ``max_length`` (typical for unpunctuated
        transcripts) is wrapped on whitespace into pieces that fit. No empty
        chunk is ever produced.
        """
        chunks = []
        current = ""
        for sentence in sentences:
            if len(sentence) <= max_length:
                pieces = [sentence]
            else:
                pieces = textwrap.wrap(
                    sentence, width=max_length, break_long_words=True, break_on_hyphens=False
                )
            for piece in pieces:
                candidate = f"{current} {piece}" if current else piece
                if len(candidate) <= max_length:
                    current = candidate
                else:
                    # `current` is non-empty here: a lone piece always fits.
                    chunks.append(current)
                    current = piece
        if current:
            chunks.append(current)
        return chunks

    # ------------------------------------------------------------------
    # Question generation
    # ------------------------------------------------------------------
    def generate_questions(self, text, num_questions=5):
        """Generate questions based on the input text.

        Args:
            text: Source material.
            num_questions: Maximum number of questions to return.

        Returns:
            A list of dicts with keys ``'question'``, ``'answer'`` and
            ``'context'`` (the sentence the question was generated from).
            Empty on failure.
        """
        try:
            # Summarize text if it's too long
            processed_text = self.summarize_text(text)
            sentences = sent_tokenize(processed_text)
            random.shuffle(sentences)  # Randomize to get different questions each time
            questions = []
            seen_questions = set()
            # Process more sentences than needed: some are rejected below.
            for sentence in sentences[:num_questions * 3]:
                if len(sentence.split()) < 5:  # Skip short sentences
                    continue
                try:
                    question = self._question_for_sentence(sentence)
                    if not question or question in seen_questions:
                        continue
                    answer = self.qa_model({'question': question, 'context': processed_text})
                except Exception as e:
                    # One bad sentence must not discard the questions
                    # already collected.
                    logger.warning("Skipping sentence during question generation: %s", e)
                    continue
                if answer['score'] > 0.1:  # Only keep questions with reasonable confidence
                    seen_questions.add(question)
                    questions.append({
                        'question': question,
                        'answer': answer['answer'],
                        'context': sentence,
                    })
                if len(questions) >= num_questions:
                    break
            return questions
        except Exception as e:
            logger.error("Error generating questions: %s", e)
            return []

    def _question_for_sentence(self, sentence):
        """Run the question-generation model on one sentence and return its text."""
        input_text = f"generate question: {sentence}"
        inputs = self.qg_tokenizer.encode(
            input_text, return_tensors="pt", max_length=512, truncation=True
        )
        outputs = self.qg_model.generate(inputs, max_length=64, num_beams=4, early_stopping=True)
        return self.qg_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    def generate_mcq(self, questions, num_options=4):
        """Convert open-ended questions to multiple-choice questions.

        Args:
            questions: Items as returned by ``generate_questions``.
            num_options: Options per question, including the correct one.

        Returns:
            A list of dicts with keys ``'question'``, ``'options'`` (shuffled),
            ``'correct_answer'`` and ``'correct_index'`` (position of the
            correct answer inside ``'options'``).
        """
        mcqs = []
        for q in questions:
            correct_answer = q['answer']
            # Generate distractors (incorrect options)
            distractors = self._generate_distractors(q['context'], correct_answer, num_options - 1)
            options = distractors + [correct_answer]
            random.shuffle(options)
            mcqs.append({
                'question': q['question'],
                'options': options,
                'correct_answer': correct_answer,
                'correct_index': options.index(correct_answer),
            })
        return mcqs

    def _generate_distractors(self, context, correct_answer, num_distractors=3):
        """Generate plausible but incorrect answers.

        Nouns found in ``context`` are preferred; generic options are added
        only when the context does not provide enough of them.
        """
        # Simple approach - extract other nouns from the text
        pos_tags = nltk.pos_tag(nltk.word_tokenize(context))
        nouns = [word for word, pos in pos_tags if pos in ('NN', 'NNS', 'NNP', 'NNPS')]
        return self._select_distractors(nouns, correct_answer, num_distractors)

    @classmethod
    def _select_distractors(cls, candidates, correct_answer, num_distractors):
        """Pick up to num_distractors wrong options from candidate words.

        Candidates are dropped when they are 2 characters or shorter, repeat
        an earlier candidate (case-insensitively), equal the correct answer,
        or are one of the words of the correct answer. The list is padded
        from the generic fallbacks only when too few candidates remain.
        """
        if num_distractors <= 0:
            return []
        answer_lower = correct_answer.lower()
        answer_words = set(re.findall(r"\w+", answer_lower))
        seen = set()
        usable = []
        for word in candidates:
            key = word.lower()
            if len(word) <= 2 or key == answer_lower or key in answer_words or key in seen:
                continue
            seen.add(key)
            usable.append(word)
        random.shuffle(usable)
        picked = usable[:num_distractors]
        if len(picked) < num_distractors:
            fillers = list(cls._GENERIC_DISTRACTORS)
            random.shuffle(fillers)
            picked.extend(fillers[:num_distractors - len(picked)])
        return picked

    def generate_true_false(self, text, num_questions=5):
        """Generate true/false questions from text.

        Args:
            text: Source material.
            num_questions: Maximum number of statements to return.

        Returns:
            A shuffled list of dicts with keys ``'statement'`` and
            ``'is_true'``. Empty on failure.
        """
        try:
            # Generate factual statements first
            questions = self.generate_questions(text, num_questions)
            true_false = []
            for q in questions:
                sentence = q['context']
                # Original statement is true
                true_false.append({'statement': sentence, 'is_true': True})
                false_version = self._make_false_statement(sentence)
                # Sentences too short to alter contribute only the true form.
                if false_version is not None:
                    true_false.append({'statement': false_version, 'is_true': False})
            # Shuffle and return required number
            random.shuffle(true_false)
            return true_false[:num_questions]
        except Exception as e:
            logger.error("Error generating true/false questions: %s", e)
            return []

    @classmethod
    def _make_false_statement(cls, sentence):
        """Return sentence with a negation word inserted, or None if it has 4 words or fewer."""
        words = sentence.split()
        if len(words) <= 4:
            return None
        change_idx = random.randrange(len(words))
        words[change_idx] = f"{random.choice(cls._NEGATION_WORDS)} {words[change_idx]}"
        return " ".join(words)
@st.cache_resource
def _load_quiz_generator():
    """Build the QuizGenerator once and reuse it across Streamlit reruns.

    Streamlit re-executes the script on every widget interaction; without
    caching, each rerun would construct a new QuizGenerator and reload its
    models.
    """
    return QuizGenerator()


def _show_text_preview(label, text, limit=500):
    """Show the first `limit` characters of `text` inside an expander titled `label`."""
    with st.expander(label):
        st.text(text[:limit] + "..." if len(text) > limit else text)


def create_streamlit_app():
    """Render the QuizWhiz page: pick a content source, then generate a quiz.

    The page collects text from one of four sources (typed text, an uploaded
    PDF/DOCX/TXT document, a YouTube transcript, or a placeholder topic
    blurb) and, when "Generate Quiz" is pressed, renders multiple-choice,
    true/false or open-ended questions with their answers hidden in
    expanders.
    """
    st.set_page_config(page_title="QuizWhiz", page_icon="📚", layout="wide")
    st.title("QuizWhiz - Comprehensive Quiz Generator")
    st.subheader("Generate quizzes from various sources: text, documents, and YouTube videos")
    quiz_gen = _load_quiz_generator()

    # Sidebar for options
    st.sidebar.header("Quiz Options")
    quiz_type = st.sidebar.selectbox(
        "Question Type",
        ["Multiple Choice", "True/False", "Open-Ended"]
    )
    num_questions = st.sidebar.slider("Number of Questions", 3, 20, 5)

    # Source selection
    st.header("Select Your Content Source")
    source_type = st.radio(
        "Content Source",
        ["Text Input", "Document Upload", "YouTube Video", "Topic/Subject"]
    )
    text_content = ""

    # Handle different source types
    if source_type == "Text Input":
        text_content = st.text_area("Enter your text content here:", height=250)
    elif source_type == "Document Upload":
        uploaded_file = st.file_uploader("Upload your document", type=['pdf', 'docx', 'txt'])
        if uploaded_file is not None:
            st.success(f"File '{uploaded_file.name}' uploaded successfully!")
            # Rewind defensively in case the buffer was already consumed by
            # an earlier run of the script.
            uploaded_file.seek(0)
            # Extract text based on file type (extension compared case-insensitively)
            file_name = uploaded_file.name.lower()
            if file_name.endswith('.pdf'):
                text_content = quiz_gen.extract_text_from_pdf(uploaded_file)
            elif file_name.endswith('.docx'):
                text_content = quiz_gen.extract_text_from_docx(uploaded_file)
            elif file_name.endswith('.txt'):
                text_content = quiz_gen.extract_text_from_txt(uploaded_file)
            # Show text preview
            if text_content:
                _show_text_preview("Preview Extracted Text", text_content)
            else:
                st.error("Failed to extract text from the document.")
    elif source_type == "YouTube Video":
        youtube_url = st.text_input("Enter YouTube Video URL:")
        if youtube_url:
            # Extract video ID from URL
            video_id_match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', youtube_url)
            if video_id_match:
                video_id = video_id_match.group(1)
                # Show video embed
                st.video(youtube_url)
                # Extract transcript
                with st.spinner("Extracting video transcript..."):
                    text_content = quiz_gen.get_youtube_transcript(video_id)
                if text_content:
                    _show_text_preview("Preview Transcript", text_content)
                else:
                    st.error("Failed to extract transcript. This video might not have captions.")
            else:
                st.error("Invalid YouTube URL. Please enter a valid URL.")
    elif source_type == "Topic/Subject":
        topic = st.text_input("Enter a topic or subject:")
        if topic:
            # For this demo, we'll use a predefined text about the topic
            # In a real app, you might use an API to fetch content about the topic
            st.info(f"Generating quiz about: {topic}")
            text_content = f"The topic of {topic} is a fascinating subject to explore. " \
                           f"There are many important concepts and facts related to {topic} " \
                           f"that make it an essential area of study. Understanding {topic} " \
                           f"requires careful consideration of its key principles."
            # Placeholder for a real implementation that would gather information about the topic
            st.warning("In a complete implementation, this would gather information about the topic from reliable sources.")

    # Generate Quiz Button
    if text_content:
        if st.button("Generate Quiz"):
            with st.spinner("Generating quiz questions..."):
                if quiz_type == "Multiple Choice":
                    # Generate questions first, then convert to MCQs
                    questions = quiz_gen.generate_questions(text_content, num_questions)
                    mcqs = quiz_gen.generate_mcq(questions)
                    if mcqs:
                        st.success(f"Generated {len(mcqs)} multiple choice questions!")
                        for i, q in enumerate(mcqs, 1):
                            st.subheader(f"Question {i}: {q['question']}")
                            # Letters are derived from the position so any
                            # number of options can be labelled.
                            for j, option in enumerate(q['options']):
                                st.write(f"{chr(ord('A') + j)}. {option}")
                            # Reveal answer in expander
                            with st.expander("Reveal Answer"):
                                correct_letter = chr(ord('A') + q['correct_index'])
                                st.write(f"Correct Answer: {correct_letter}. {q['correct_answer']}")
                            st.divider()
                    else:
                        st.error("Failed to generate questions. Try with different content.")
                elif quiz_type == "True/False":
                    tf_questions = quiz_gen.generate_true_false(text_content, num_questions)
                    if tf_questions:
                        st.success(f"Generated {len(tf_questions)} true/false questions!")
                        for i, q in enumerate(tf_questions, 1):
                            st.subheader(f"Question {i}: True or False?")
                            st.write(q['statement'])
                            # Reveal answer
                            with st.expander("Reveal Answer"):
                                st.write(f"Answer: {'True' if q['is_true'] else 'False'}")
                            st.divider()
                    else:
                        st.error("Failed to generate true/false questions. Try with different content.")
                elif quiz_type == "Open-Ended":
                    questions = quiz_gen.generate_questions(text_content, num_questions)
                    if questions:
                        st.success(f"Generated {len(questions)} open-ended questions!")
                        for i, q in enumerate(questions, 1):
                            st.subheader(f"Question {i}: {q['question']}")
                            # Reveal answer
                            with st.expander("Reveal Answer"):
                                st.write(f"Suggested Answer: {q['answer']}")
                                st.write(f"Context: {q['context']}")
                            st.divider()
                    else:
                        st.error("Failed to generate questions. Try with different content.")
    else:
        st.info("Please provide content or select a source to generate a quiz.")

    # Footer
    st.sidebar.divider()
    st.sidebar.caption("QuizWhiz - Powered by AI")
    st.sidebar.caption("© 2025 QuizWhiz Technologies")
# Script entry point: build the Streamlit page when this file is executed
# directly; importing the module does not render anything.
if __name__ == "__main__":
    create_streamlit_app()