Spaces:

Natwar
/

Fine_Tuned_Phi2_QnA_Chatbot

Sleeping

File size: 30,497 Bytes
import os
import subprocess
import sys
import warnings
import logging
from typing import List, Dict, Any, Optional
import tempfile
import re
import time
import gc
import spaces

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("debug.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Suppress warnings
warnings.filterwarnings("ignore")

def install_package(package: str, version: Optional[str] = None) -> None:
    """Install a Python package if not already installed"""
    package_spec = f"{package}=={version}" if version else package
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", package_spec])
        print(f"Successfully installed {package_spec}")
    except subprocess.CalledProcessError as e:
        print(f"Failed to install {package_spec}: {e}")
        raise

# Required packages - install these before importing
required_packages = {
    "torch": None,
    "gradio": "3.10.1",
    "transformers": None,
    "peft": None,
    "bitsandbytes": None,
    "PyPDF2": None,
    "python-docx": None,
    "accelerate": None,
    "sentencepiece": None,
}

# Install required packages BEFORE importing them
for package, version in required_packages.items():
    try:
        __import__(package)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"Installing {package}...")
        install_package(package, version)

# Now we can safely import all required modules
import torch
import transformers
import gradio as gr
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer, TrainerCallback,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
import PyPDF2
import docx
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset as TorchDataset

# Suppress transformers warnings
transformers.logging.set_verbosity_error()

# Check GPU availability
if torch.cuda.is_available():
    DEVICE = "cuda"
    print(f"GPU found: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
else:
    DEVICE = "cpu"
    print("No GPU found, using CPU. Fine-tuning will be much slower.")
    print("For better performance, use Google Colab with GPU runtime (Runtime > Change runtime type > GPU)")

# Constants specific to Phi-2
MODEL_KEY = "microsoft/phi-2"
MAX_SEQ_LEN = 512  # Reduced from 1024 for much lighter memory usage
# FIX: Updated target modules for Phi-2
LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "dense"]  # Correct modules for Phi-2

# Initialize model and tokenizer
model = None
tokenizer = None
fine_tuned_model = None
document_text = ""  # Store document content for context

def load_base_model() -> str:
    """Load Phi-2 with 8-bit quantization instead of 4-bit for faster training"""
    global model, tokenizer

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()

    try:
        # Use 8-bit quantization (faster to train than 4-bit)
        if DEVICE == "cuda":
            bnb_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False
            )
        else:
            bnb_config = None

        # Load tokenizer with Phi-2 specific settings
        print("Loading Phi-2 tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_KEY,
            trust_remote_code=True,
            padding_side="right"
        )

        # Ensure pad token is properly set
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load model with Phi-2 specific configuration
        print("Loading Phi-2 model... (this may take a few minutes)")
        if DEVICE == "cuda":
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_KEY,
                quantization_config=bnb_config,
                device_map="auto",
                torch_dtype=torch.float16,
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_KEY,
                torch_dtype=torch.float32,
                trust_remote_code=True,
                low_cpu_mem_usage=True
            ).to(DEVICE)

        print("Phi-2 (2.7B) model loaded successfully!")
        return "Phi-2 (2.7B) model loaded successfully! Ready to process documents."

    except Exception as e:
        error_msg = f"Error loading model: {str(e)}"
        print(error_msg)
        return error_msg

def phi2_prompt_template(context: str, question: str) -> str:
    """
    Create a prompt optimized for Phi-2
    Phi-2 responds well to clear instruction formatting
    """
    return f"""Instruction: Answer the question accurately based on the context provided.
Context: {context}
Question: {question}
Answer:"""

def process_pdf(file_path: str) -> str:
    """Extract text from PDF file"""
    text = ""
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            total_pages = len(pdf_reader.pages)
            # Process at most 30 pages to avoid memory issues
            pages_to_process = min(total_pages, 30)
            for i in range(pages_to_process):
                page = pdf_reader.pages[i]
                page_text = page.extract_text() or ""
                text += page_text + "\n"

            if total_pages > pages_to_process:
                text += f"\n[Note: Only the first {pages_to_process} pages were processed due to size limitations.]"
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
    return text

def process_docx(file_path: str) -> str:
    """Extract text from DOCX file"""
    try:
        doc = docx.Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
        return text
    except Exception as e:
        print(f"Error processing DOCX: {str(e)}")
        return ""

def process_txt(file_path: str) -> str:
    """Extract text from TXT file"""
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            text = file.read()
        return text
    except Exception as e:
        print(f"Error processing TXT: {str(e)}")
        return ""

def preprocess_text(text: str) -> str:
    """Clean and preprocess text"""
    if not text:
        return ""
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters that may cause issues
    text = re.sub(r'[^\w\s.,;:!?\'\"()-]', '', text)
    return text.strip()

def get_semantic_chunks(text: str, chunk_size: int = 300, overlap: int = 50) -> List[str]:
    """More efficient semantic chunking"""
    if not text:
        return []

    # Simple sentence splitting for speed
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        words = sentence.split()
        if current_length + len(words) <= chunk_size:
            current_chunk.append(sentence)
            current_length += len(words)
        else:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(words)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    # Limit to just 5 chunks for much faster processing
    if len(chunks) > 5:
        indices = np.linspace(0, len(chunks)-1, 5, dtype=int)
        chunks = [chunks[i] for i in indices]

    return chunks

def create_qa_dataset(document_chunks: List[str]) -> List[Dict[str, str]]:
    """Create comprehensive QA pairs from document chunks for better fine-tuning"""
    qa_pairs = []

    # Document-level questions
    full_text = " ".join(document_chunks[:5])  # Use beginning of document for overview
    qa_pairs.append({
        "question": "What is this document about?",
        "context": full_text,
        "answer": "Based on my analysis, this document discusses..."  # Empty template for model to learn
    })

    qa_pairs.append({
        "question": "Summarize the key points of this document.",
        "context": full_text,
        "answer": "The key points of this document are..."
    })

    # Process each chunk for specific QA pairs
    for i, chunk in enumerate(document_chunks):
        if not chunk or len(chunk) < 100:  # Skip very short chunks
            continue

        # Context-specific questions
        chunk_index = i + 1  # 1-indexed for readability

        # Basic factual questions about chunk content
        qa_pairs.append({
            "question": f"What information is contained in section {chunk_index}?",
            "context": chunk,
            "answer": f"Section {chunk_index} contains information about..."
        })

        # Entity-based questions - find names, organizations, technical terms
        entities = set(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', chunk))
        technical_terms = set(re.findall(r'\b[A-Za-z]+-?[A-Za-z]+\b', chunk))

        # Filter to meaningful entities (longer than 3 chars)
        entities = [e for e in entities if len(e) > 3][:2]  # Limit to 2 entity questions per chunk

        for entity in entities:
            qa_pairs.append({
                "question": f"What does the document say about {entity}?",
                "context": chunk,
                "answer": f"Regarding {entity}, the document states that..."
            })

        # Specific content questions
        sentences = re.split(r'(?<=[.!?])\s+', chunk)
        key_sentences = [s for s in sentences if len(s.split()) > 8][:2]  # Focus on substantive sentences

        for sentence in key_sentences:
            # Create question from sentence by identifying subject
            subject_match = re.search(r'^(The|A|An|This|These|Those|Some|Any|Many|Few|All|Most)?\s*([A-Za-z\s]+?)\s+(is|are|was|were|has|have|had|can|could|will|would|may|might)', sentence, re.IGNORECASE)
            if subject_match:
                subject = subject_match.group(2).strip()
                if len(subject) > 2:
                    qa_pairs.append({
                        "question": f"What information is provided about {subject}?",
                        "context": chunk,
                        "answer": sentence
                    })

        # Add relationship questions between concepts
        if i < len(document_chunks) - 1:
            next_chunk = document_chunks[i+1]
            qa_pairs.append({
                "question": f"How does the information in section {chunk_index} relate to section {chunk_index+1}?",
                "context": chunk + " " + next_chunk,
                "answer": f"Section {chunk_index} discusses... while section {chunk_index+1} covers... The relationship between them is..."
            })

    # Limit to 5 examples max for lighter memory usage
    if len(qa_pairs) > 5:
        import random
        random.shuffle(qa_pairs)
        qa_pairs = qa_pairs[:5]

    return qa_pairs

class QADataset(TorchDataset):
    """PyTorch dataset specialized for Phi-2 QA fine-tuning"""
    def __init__(self, qa_pairs: List[Dict[str, str]], tokenizer, max_length: int = MAX_SEQ_LEN):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Verify dataset structure
        self.validate_dataset()

    def validate_dataset(self):
        """Verify that the dataset has proper structure"""
        if not self.qa_pairs:
            print("Warning: Empty dataset!")
            return

        required_keys = ["question", "context", "answer"]
        for i, item in enumerate(self.qa_pairs[:5]):  # Check first 5 examples
            missing = [k for k in required_keys if k not in item]
            if missing:
                print(f"Warning: Example {i} missing keys: {missing}")

            # Check for empty values
            empty = [k for k in required_keys if k in item and not item[k]]
            if empty:
                print(f"Warning: Example {i} has empty values for: {empty}")

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        qa_pair = self.qa_pairs[idx]

        # Format prompt using Phi-2 template
        context = qa_pair['context']
        question = qa_pair['question']
        answer = qa_pair['answer']

        # Build Phi-2 specific prompt
        prompt = phi2_prompt_template(context, question)

        # Concatenate prompt and answer
        sequence = f"{prompt} {answer}"

        try:
            # Tokenize with proper handling
            encoded = self.tokenizer(
                sequence,
                truncation=True,
                max_length=self.max_length,
                padding="max_length",
                return_tensors="pt"
            )

            # Extract tensors
            input_ids = encoded["input_ids"].squeeze(0)
            attention_mask = encoded["attention_mask"].squeeze(0)

            # Create labels
            labels = input_ids.clone()

            # Calculate prompt length accurately
            prompt_encoded = self.tokenizer(prompt, add_special_tokens=False)
            prompt_length = len(prompt_encoded["input_ids"])

            # Ensure prompt_length doesn't exceed labels length
            prompt_length = min(prompt_length, len(labels))

            # Set labels for prompt portion to -100 (ignored in loss calculation)
            labels[:prompt_length] = -100

            return {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": labels
            }

        except Exception as e:
            print(f"Error processing sample {idx}: {e}")
            # Return dummy sample as fallback
            return {
                "input_ids": torch.zeros(self.max_length, dtype=torch.long),
                "attention_mask": torch.zeros(self.max_length, dtype=torch.long),
                "labels": torch.zeros(self.max_length, dtype=torch.long)
            }

def clear_gpu_memory():
    """Clear GPU memory to prevent OOM errors"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()

class ProgressCallback(TrainerCallback):
    def __init__(self, progress, status_box=None):
        self.progress = progress
        self.status_box = status_box
        self.current_step = 0
        self.total_steps = 0

    def on_train_begin(self, args, state, control, **kwargs):
        self.total_steps = state.max_steps

    def on_step_end(self, args, state, control, **kwargs):
        self.current_step = state.global_step
        progress_percent = self.current_step / self.total_steps
        self.progress(0.4 + (0.5 * progress_percent),
                     desc=f"Epoch {state.epoch}/{args.num_train_epochs} | Step {self.current_step}/{self.total_steps}")
        if self.status_box:
            self.status_box.update(f"Training in progress: Epoch {state.epoch}/{args.num_train_epochs} | Step {self.current_step}/{self.total_steps}")

def create_deepspeed_config():
    """Create DeepSpeed config for faster training"""
    return {
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": 2,
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": True
            },
            "allgather_partitions": True,
            "allgather_bucket_size": 5e8,
            "reduce_scatter": True,
            "reduce_bucket_size": 5e8,
            "overlap_comm": True,
            "contiguous_gradients": True
        },
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": 2e-4,
                "betas": [0.9, 0.999],
                "eps": 1e-8,
                "weight_decay": 0.01
            }
        },
        "scheduler": {
            "type": "WarmupLR",
            "params": {
                "warmup_min_lr": 0,
                "warmup_max_lr": 2e-4,
                "warmup_num_steps": 50
            }
        },
        "train_batch_size": 1,
        "train_micro_batch_size_per_gpu": 1,
        "gradient_accumulation_steps": 1,
        "gradient_clipping": 0.5,
        "steps_per_print": 10
    }

def finetune_model(qa_dataset, progress=gr.Progress(), status_box=None):
    """Fine-tune Phi-2 using optimized LoRA parameters"""
    global model, tokenizer, fine_tuned_model

    if model is None:
        return "Please load the base model first."

    if len(qa_dataset) == 0:
        return "No training data created. Please check your document."

    try:
        progress(0.1, desc="Preparing model for fine-tuning...")
        if status_box:
            status_box.update("Preparing model for fine-tuning...")

        # Clear GPU memory
        clear_gpu_memory()

        # Prepare model for 8-bit training if using GPU
        if DEVICE == "cuda":
            training_model = prepare_model_for_kbit_training(model)
        else:
            training_model = model

        # Add this line to fix the gradient error
        training_model.enable_input_require_grads()

        # Configure LoRA for Phi-2
        peft_config = LoraConfig(
            r=2,  # Reduced rank for lighter training
            lora_alpha=4,  # Reduced alpha
            lora_dropout=0.05,  # Added small dropout for regularization
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=LORA_TARGET_MODULES  # Fixed Phi-2 modules
        )

        # Apply LoRA to model
        lora_model = get_peft_model(training_model, peft_config)

        # Print trainable parameters
        trainable_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)
        all_params = sum(p.numel() for p in lora_model.parameters())
        print(f"Trainable parameters: {trainable_params:,} ({trainable_params/all_params:.2%} of {all_params:,} total)")

        # Enable gradient checkpointing for memory efficiency
        if hasattr(lora_model, "gradient_checkpointing_enable"):
            lora_model.gradient_checkpointing_enable()
            print("Gradient checkpointing enabled")

        # Create training arguments optimized for Phi-2
        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=2,  # Set to 2 as requested
            per_device_train_batch_size=1,
            gradient_accumulation_steps=1,
            learning_rate=1e-4,  # Reduced from 2e-4 for stability
            lr_scheduler_type="constant",  # Simplified scheduler
            warmup_ratio=0.05,  # Slight increase in warmup
            weight_decay=0.01,
            logging_steps=1,
            max_grad_norm=0.3,  # Reduced from 0.5 for better gradient stability
            save_strategy="no",
            report_to="none",
            remove_unused_columns=False,
            fp16=(DEVICE == "cuda"),
            no_cuda=(DEVICE == "cpu"),
            optim="adamw_torch",  # Use standard optimizer instead of fused for stability
            gradient_checkpointing=True
        )

        # Add DeepSpeed if on CUDA
        if DEVICE == "cuda":
            training_args.deepspeed = create_deepspeed_config()

        # Create data collator that doesn't move tensors to device yet
        def collate_fn(features):
            batch = {}
            for key in features[0].keys():
                if key in ["input_ids", "attention_mask", "labels"]:
                    batch[key] = torch.stack([f[key] for f in features])
            return batch

        progress(0.3, desc="Setting up trainer...")
        if status_box:
            status_box.update("Setting up trainer...")

        # Create trainer
        trainer = Trainer(
            model=lora_model,
            args=training_args,
            train_dataset=qa_dataset,
            data_collator=collate_fn,
            callbacks=[ProgressCallback(progress, status_box)]  # Add both callbacks
        )

        # Start training
        progress(0.4, desc="Initializing training...")
        if status_box:
            status_box.update("Initializing training...")
        print("Starting training...")
        trainer.train()

        # Set fine-tuned model
        fine_tuned_model = lora_model

        # Put model in evaluation mode
        fine_tuned_model.eval()

        # Clear memory
        clear_gpu_memory()

        return "Fine-tuning completed successfully! You can now ask questions about your document."

    except Exception as e:
        error_msg = f"Error during fine-tuning: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()

        # Try to clean up memory
        try:
            clear_gpu_memory()
        except:
            pass

        return error_msg

def process_document(file_obj, progress=gr.Progress(), status_box=None):
    """Process uploaded document and prepare dataset for fine-tuning"""
    global model, tokenizer, document_text

    progress(0, desc="Processing document...")
    if status_box:
        status_box.update("Processing document...")

    if not file_obj:
        return "Please upload a document first."

    try:
        # Create temp directory for file
        temp_dir = tempfile.mkdtemp()

        # Get file name
        file_name = getattr(file_obj, 'name', 'uploaded_file')
        if not isinstance(file_name, str):
            file_name = "uploaded_file.txt"  # Default name

        # Ensure file has extension
        if '.' not in file_name:
            file_name = file_name + '.txt'

        temp_path = os.path.join(temp_dir, file_name)

        # Get file content
        if hasattr(file_obj, 'read'):
            file_content = file_obj.read()
        else:
            file_content = file_obj

        with open(temp_path, 'wb') as f:
            f.write(file_content)

        # Extract text based on file extension
        file_extension = os.path.splitext(file_name)[1].lower()

        if file_extension == '.pdf':
            text = process_pdf(temp_path)
        elif file_extension in ['.docx', '.doc']:
            text = process_docx(temp_path)
        elif file_extension == '.txt' or True:  # Default to txt for unknown extensions
            text = process_txt(temp_path)

        # Check if text was extracted
        if not text or len(text) < 50:
            return "Could not extract sufficient text from the document. Please check the file."

        # Save document text for context window during inference
        document_text = text

        # Preprocess and chunk the document
        progress(0.3, desc="Preprocessing document...")
        if status_box:
            status_box.update("Preprocessing document...")
        text = preprocess_text(text)
        chunks = get_semantic_chunks(text)

        if not chunks:
            return "Could not extract meaningful text from the document."

        # Create enhanced QA pairs
        progress(0.5, desc="Creating QA dataset...")
        if status_box:
            status_box.update("Creating QA dataset...")
        qa_pairs = create_qa_dataset(chunks)

        print(f"Created {len(qa_pairs)} QA pairs for training")

        # Debug: Print a sample of QA pairs to verify format
        if qa_pairs:
            print("\nSample QA pair for validation:")
            sample = qa_pairs[0]
            print(f"Question: {sample['question']}")
            print(f"Context length: {len(sample['context'])} chars")
            print(f"Answer: {sample['answer'][:50]}...")

        # Create dataset
        qa_dataset = QADataset(qa_pairs, tokenizer, max_length=MAX_SEQ_LEN)

        # Fine-tune model
        progress(0.7, desc="Starting fine-tuning...")
        if status_box:
            status_box.update("Starting fine-tuning...")
        result = finetune_model(qa_dataset, progress, status_box)

        # Clean up
        try:
            os.remove(temp_path)
            os.rmdir(temp_dir)
        except:
            pass

        return result

    except Exception as e:
        error_msg = f"Error processing document: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return error_msg

def generate_answer(question, status_box=None):
    """Generate answer using fine-tuned Phi-2 model with improved response quality"""
    global fine_tuned_model, tokenizer, document_text

    if fine_tuned_model is None:
        return "Please process a document first!"

    if not question.strip():
        return "Please enter a question."

    try:
        # Clear memory before generation
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # For better answers, use document context to help the model
        # Find relevant context from document (simple keyword matching for efficiency)
        keywords = re.findall(r'\b\w{5,}\b', question.lower())
        context = document_text

        # If document is very long, try to find relevant section
        if len(document_text) > 2000 and keywords:
            chunks = get_semantic_chunks(document_text, chunk_size=500, overlap=100)
            relevant_chunks = []

            for chunk in chunks:
                score = sum(1 for keyword in keywords if keyword.lower() in chunk.lower())
                if score > 0:
                    relevant_chunks.append((chunk, score))

            relevant_chunks.sort(key=lambda x: x[1], reverse=True)

            if relevant_chunks:
                # Use top 2 most relevant chunks
                context = " ".join([chunk for chunk, _ in relevant_chunks[:2]])

        # Limit context length to fit in model's context window
        context = context[:1500]  # Limit to 1500 chars for prompt space

        # Create Phi-2 optimized prompt
        prompt = phi2_prompt_template(context, question)

        # Ensure model is in evaluation mode
        fine_tuned_model.eval()

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(fine_tuned_model.device)

        # Configure generation parameters optimized for Phi-2
        with torch.no_grad():
            outputs = fine_tuned_model.generate(
                **inputs,
                max_new_tokens=75,  # Reduced from 150
                do_sample=True,
                temperature=0.7,
                top_k=40,
                top_p=0.85,
                repetition_penalty=1.2,
                pad_token_id=tokenizer.pad_token_id
            )

        # Decode response
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the generated answer part
        if "Answer:" in response:
            answer = response.split("Answer:")[-1].strip()
        else:
            answer = response

        # If answer is too short or generic, try again with more temperature
        if len(answer.split()) < 10 or "I don't have enough information" in answer:
            with torch.no_grad():
                outputs = fine_tuned_model.generate(
                    **inputs,
                    max_new_tokens=75,  # Reduced from 150
                    do_sample=True,
                    temperature=0.9,  # Higher temperature
                    top_k=40,
                    top_p=0.92,
                    repetition_penalty=1.2,
                    pad_token_id=tokenizer.pad_token_id
                )

            # Decode second attempt
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract answer
            if "Answer:" in response:
                answer = response.split("Answer:")[-1].strip()
            else:
                answer = response

        return answer

    except Exception as e:
        error_msg = f"Error generating answer: {str(e)}"
        print(error_msg)
        return error_msg

# Create Gradio interface
with gr.Blocks(title="Phi-2 Document QA", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 Phi-2 Document Q&A System")
    gr.Markdown("Specialized system for fine-tuning Microsoft's Phi-2 model on your documents")

    with gr.Tab("Document Processing"):
        file_input = gr.File(
            label="Upload Document (PDF, DOCX, or TXT)",
            file_types=[".pdf", ".docx", ".txt"],
            type="binary"
        )

        with gr.Row():
            load_model_btn = gr.Button("1. Load Phi-2 Model", variant="secondary")
            process_btn = gr.Button("2. Process & Fine-tune Document", variant="primary")

        status = gr.Textbox(
            label="Status",
            placeholder="First load the model, then upload a document and click 'Process & Fine-tune'",
            lines=3
        )

        gr.Markdown("""
        ### Tips for Best Results
        - PDF, DOCX and TXT files are supported
        - Keep documents under 10 pages for best results
        - Processing time depends on document length and GPU availability
        - For GPU usage in Colab: Runtime > Change runtime type > GPU
        """)

    with gr.Tab("Ask Questions"):
        question_input = gr.Textbox(
            label="Your Question",
            placeholder="Ask about your document...",
            lines=2
        )

        ask_btn = gr.Button("Get Answer", variant="primary")

        answer_output = gr.Textbox(
            label="Phi-2's Response",
            placeholder="The answer will appear here after you ask a question",
            lines=8
        )

        gr.Markdown("""
        ### Example Questions
        - "What is this document about?"
        - "Summarize the key points in this document"
        - "What does the document say about [specific topic]?"
        - "Explain the relationship between [concept A] and [concept B]"
        """)

    # Set up events
    load_model_btn.click(
        fn=load_base_model,
        outputs=[status]
    )

    process_btn.click(
        fn=process_document,
        inputs=[file_input],
        outputs=[status]
    )

    ask_btn.click(
        fn=generate_answer,
        inputs=[question_input],
        outputs=[answer_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)