import re
import PyPDF2
import docx
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from youtube_transcript_api import YouTubeTranscriptApi
import streamlit as st
import random
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


class QuizGenerator:
    def __init__(self):
        # Initialize the summarizer and question generator models
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

        # Load question generation model
        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-qg-hl")
        self.qg_tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-base-qg-hl")

        # Initialize MCQ generation components
        self.qa_model = pipeline('question-answering', model='distilbert-base-cased-distilled-squad')

    def extract_text_from_pdf(self, pdf_file):
        """Extract text from a PDF file."""
        try:
            text = ""
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page in pdf_reader.pages:
                # extract_text() can return None for image-only pages
                text += (page.extract_text() or "") + "\n"
            return text
        except Exception as e:
            logger.error(f"Error extracting text from PDF: {e}")
            return ""

    def extract_text_from_docx(self, docx_file):
        """Extract text from a DOCX file."""
        try:
            doc = docx.Document(docx_file)
            text = ""
            for para in doc.paragraphs:
                text += para.text + "\n"
            return text
        except Exception as e:
            logger.error(f"Error extracting text from DOCX: {e}")
            return ""

    def extract_text_from_txt(self, txt_file):
        """Extract text from a TXT file."""
        try:
            return txt_file.read().decode('utf-8')
        except Exception as e:
            logger.error(f"Error extracting text from TXT: {e}")
            return ""

    def get_youtube_transcript(self, video_id):
        """Extract transcript from a YouTube video."""
        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
            transcript = ' '.join([item['text'] for item in transcript_list])
            return transcript
        except Exception as e:
            logger.error(f"Error getting YouTube transcript: {e}")
            return ""

    def summarize_text(self, text, max_length=1000):
        """Summarize long text to make processing more efficient."""
        if len(text) <= max_length:
            return text

        chunks = self._split_text_into_chunks(text, max_length=3000)
        summaries = []
        for chunk in chunks:
            if len(chunk) < 100:  # Skip chunks that are too small
                continue
            summary = self.summarizer(chunk, max_length=300, min_length=100, do_sample=False)
            summaries.append(summary[0]['summary_text'])

        return " ".join(summaries)

    def _split_text_into_chunks(self, text, max_length=3000):
        """Split text into chunks of max_length characters."""
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= max_length:
                current_chunk += " " + sentence
            else:
                chunks.append(current_chunk.strip())
                current_chunk = sentence

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def generate_questions(self, text, num_questions=5):
        """Generate questions based on the input text."""
        try:
            # Summarize text if it's too long
            processed_text = self.summarize_text(text)

            # Split into sentences
            sentences = sent_tokenize(processed_text)
            questions = []
            random.shuffle(sentences)  # Randomize to get different questions each time

            # Process more sentences than needed
            for sentence in sentences[:min(num_questions * 3, len(sentences))]:
                if len(sentence.split()) < 5:  # Skip short sentences
                    continue

                # Format for the question generation model
                input_text = f"generate question: {sentence}"

                # Generate question
                inputs = self.qg_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
                outputs = self.qg_model.generate(inputs, max_length=64, num_beams=4, early_stopping=True)
                question = self.qg_tokenizer.decode(outputs[0], skip_special_tokens=True)

                # Use QA model to get answer
                qa_input = {
                    'question': question,
                    'context': processed_text
                }
                answer = self.qa_model(qa_input)

                if answer['score'] > 0.1:  # Only keep questions with reasonable confidence
                    questions.append({
                        'question': question,
                        'answer': answer['answer'],
                        'context': sentence
                    })

                if len(questions) >= num_questions:
                    break

            return questions
        except Exception as e:
            logger.error(f"Error generating questions: {e}")
            return []

    def generate_mcq(self, questions, num_options=4):
        """Convert open-ended questions to multiple-choice questions."""
        mcqs = []

        for q in questions:
            correct_answer = q['answer']

            # Generate distractors (incorrect options)
            distractors = self._generate_distractors(q['context'], correct_answer, num_options - 1)

            # Create options list with correct answer
            options = distractors + [correct_answer]
            random.shuffle(options)

            # Find position of correct answer
            correct_index = options.index(correct_answer)

            mcqs.append({
                'question': q['question'],
                'options': options,
                'correct_answer': correct_answer,
                'correct_index': correct_index
            })

        return mcqs

    def _generate_distractors(self, context, correct_answer, num_distractors=3):
        """Generate plausible but incorrect answers."""
        # Simple approach - extract other nouns from the text
        words = nltk.word_tokenize(context)
        pos_tags = nltk.pos_tag(words)

        # Extract nouns and named entities
        nouns = [word for word, pos in pos_tags
                 if pos in ('NN', 'NNS', 'NNP', 'NNPS') and word.lower() != correct_answer.lower()]

        # Deduplicate and filter
        unique_nouns = list(set(nouns))
        distractors = [noun for noun in unique_nouns if len(noun) > 2]

        # If we don't have enough distractors, add some generic ones
        generic_distractors = ["None of the above", "Cannot be determined", "All of the above"]

        # Combine and return required number
        combined = list(distractors) + generic_distractors
        random.shuffle(combined)
        return combined[:num_distractors]

    def generate_true_false(self, text, num_questions=5):
        """Generate true/false questions from text."""
        try:
            # Generate factual statements first
            questions = self.generate_questions(text, num_questions)
            true_false = []

            for q in questions:
                # Original statement is true
                true_statement = {
                    'statement': q['context'],
                    'is_true': True
                }

                # Create a false version by negating or changing key parts
                words = q['context'].split()
                if len(words) > 4:
                    # Simple modification: replace a word or add a negation
                    change_idx = random.randint(0, len(words) - 1)
                    words[change_idx] = random.choice(["not", "never", "rarely", "incorrectly"]) + " " + words[change_idx]

                    false_statement = {
                        'statement': " ".join(words),
                        'is_true': False
                    }

                    true_false.extend([true_statement, false_statement])

            # Shuffle and return required number
            random.shuffle(true_false)
            return true_false[:num_questions]
        except Exception as e:
            logger.error(f"Error generating true/false questions: {e}")
            return []


def create_streamlit_app():
    st.set_page_config(page_title="QuizWhiz", page_icon="📚", layout="wide")

    st.title("QuizWhiz - Comprehensive Quiz Generator")
    st.subheader("Generate quizzes from various sources: text, documents, and YouTube videos")

    quiz_gen = QuizGenerator()
    # Sidebar for options
    st.sidebar.header("Quiz Options")
    quiz_type = st.sidebar.selectbox(
        "Question Type",
        ["Multiple Choice", "True/False", "Open-Ended"]
    )
    num_questions = st.sidebar.slider("Number of Questions", 3, 20, 5)

    # Source selection
    st.header("Select Your Content Source")
    source_type = st.radio(
        "Content Source",
        ["Text Input", "Document Upload", "YouTube Video", "Topic/Subject"]
    )

    text_content = ""

    # Handle different source types
    if source_type == "Text Input":
        text_content = st.text_area("Enter your text content here:", height=250)

    elif source_type == "Document Upload":
        uploaded_file = st.file_uploader("Upload your document", type=['pdf', 'docx', 'txt'])
        if uploaded_file is not None:
            st.success(f"File '{uploaded_file.name}' uploaded successfully!")

            # Extract text based on file type
            if uploaded_file.name.endswith('.pdf'):
                text_content = quiz_gen.extract_text_from_pdf(uploaded_file)
            elif uploaded_file.name.endswith('.docx'):
                text_content = quiz_gen.extract_text_from_docx(uploaded_file)
            elif uploaded_file.name.endswith('.txt'):
                text_content = quiz_gen.extract_text_from_txt(uploaded_file)

            # Show text preview
            if text_content:
                with st.expander("Preview Extracted Text"):
                    st.text(text_content[:500] + "..." if len(text_content) > 500 else text_content)
            else:
                st.error("Failed to extract text from the document.")

    elif source_type == "YouTube Video":
        youtube_url = st.text_input("Enter YouTube Video URL:")
        if youtube_url:
            # Extract video ID from URL
            video_id_match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', youtube_url)
            if video_id_match:
                video_id = video_id_match.group(1)

                # Show video embed
                st.video(youtube_url)

                # Extract transcript
                with st.spinner("Extracting video transcript..."):
                    text_content = quiz_gen.get_youtube_transcript(video_id)

                if text_content:
                    with st.expander("Preview Transcript"):
                        st.text(text_content[:500] + "..." if len(text_content) > 500 else text_content)
                else:
                    st.error("Failed to extract transcript. This video might not have captions.")
            else:
                st.error("Invalid YouTube URL. Please enter a valid URL.")

    elif source_type == "Topic/Subject":
        topic = st.text_input("Enter a topic or subject:")
        if topic:
            # For this demo, we'll use a predefined text about the topic
            # In a real app, you might use an API to fetch content about the topic
            st.info(f"Generating quiz about: {topic}")
            text_content = f"The topic of {topic} is a fascinating subject to explore. " \
                           f"There are many important concepts and facts related to {topic} " \
                           f"that make it an essential area of study. Understanding {topic} " \
                           f"requires careful consideration of its key principles."

            # Placeholder for a real implementation that would gather information about the topic
            st.warning("In a complete implementation, this would gather information about the topic from reliable sources.")

    # Generate Quiz Button
    if text_content:
        if st.button("Generate Quiz"):
            with st.spinner("Generating quiz questions..."):
                if quiz_type == "Multiple Choice":
                    # Generate questions first
                    questions = quiz_gen.generate_questions(text_content, num_questions)
                    # Convert to MCQs
                    mcqs = quiz_gen.generate_mcq(questions)

                    if mcqs:
                        st.success(f"Generated {len(mcqs)} multiple choice questions!")

                        # Display questions
                        for i, q in enumerate(mcqs, 1):
                            st.subheader(f"Question {i}: {q['question']}")

                            # Display options
                            option_letters = ['A', 'B', 'C', 'D']
                            for j, option in enumerate(q['options']):
                                st.write(f"{option_letters[j]}. {option}")

                            # Reveal answer in expander
                            with st.expander("Reveal Answer"):
                                st.write(f"Correct Answer: {option_letters[q['correct_index']]}. {q['correct_answer']}")

                            st.divider()
                    else:
                        st.error("Failed to generate questions. Try with different content.")

                elif quiz_type == "True/False":
                    tf_questions = quiz_gen.generate_true_false(text_content, num_questions)

                    if tf_questions:
                        st.success(f"Generated {len(tf_questions)} true/false questions!")

                        # Display questions
                        for i, q in enumerate(tf_questions, 1):
                            st.subheader(f"Question {i}: True or False?")
                            st.write(q['statement'])

                            # Reveal answer
                            with st.expander("Reveal Answer"):
                                st.write(f"Answer: {'True' if q['is_true'] else 'False'}")

                            st.divider()
                    else:
                        st.error("Failed to generate true/false questions. Try with different content.")

                elif quiz_type == "Open-Ended":
                    questions = quiz_gen.generate_questions(text_content, num_questions)

                    if questions:
                        st.success(f"Generated {len(questions)} open-ended questions!")

                        # Display questions
                        for i, q in enumerate(questions, 1):
                            st.subheader(f"Question {i}: {q['question']}")

                            # Reveal answer
                            with st.expander("Reveal Answer"):
                                st.write(f"Suggested Answer: {q['answer']}")
                                st.write(f"Context: {q['context']}")

                            st.divider()
                    else:
                        st.error("Failed to generate questions. Try with different content.")
    else:
        st.info("Please provide content or select a source to generate a quiz.")

    # Footer
    st.sidebar.divider()
    st.sidebar.caption("QuizWhiz - Powered by AI")
    st.sidebar.caption("© 2025 QuizWhiz Technologies")


if __name__ == "__main__":
    create_streamlit_app()