import streamlit as st
import fitz  # PyMuPDF
import cv2
import numpy as np
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import tempfile
import torch

# Initialize free OpenLLaMA model (no auth needed)
model_name = "openlm-research/open_llama_7b_v2"


@st.cache_resource
def _load_model(name):
    """Load the tokenizer and causal-LM weights for checkpoint *name*.

    Streamlit re-executes this script on every widget interaction, so loading
    the checkpoint directly at module level would repeat the (very expensive)
    load on each rerun. ``st.cache_resource`` keeps one shared instance alive
    across reruns and sessions.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # NOTE(review): the OpenLLaMA model card reportedly recommends
    # use_fast=False for this tokenizer -- confirm before changing.
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return loaded_tokenizer, loaded_model


tokenizer, model = _load_model(model_name)

def pdf_to_images(pdf_path, dpi=200):
    """Render every page of a PDF to a PIL image using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Rendering resolution passed to ``page.get_pixmap`` (default 200,
            the value previously hard-coded).

    Returns:
        A list with one RGB ``PIL.Image`` per page, in page order.
    """
    images = []
    # The context manager closes the document (and its file handle) even if
    # rendering a page raises; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # alpha=False is stated explicitly because the "RGB" mode below
            # requires exactly three samples per pixel.
            pix = page.get_pixmap(dpi=dpi, alpha=False)
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images

def highlight_differences(img1, img2, threshold=25):
    """Mark the pixels that differ between two page images in red.

    Args:
        img1: The "before" image (anything ``np.array`` turns into an RGB
            array, e.g. a PIL image).
        img2: The "after" image; the highlighted output is a copy of it.
        threshold: Minimum absolute grayscale difference (0-255) for a pixel
            to be highlighted (default 25, the value previously hard-coded).

    Returns:
        A tuple ``(highlighted, score)`` where ``highlighted`` is a PIL image
        of ``img2`` with changed pixels painted red and ``score`` is the mean
        absolute grayscale difference over the whole page.
    """
    before = np.array(img1)
    after = np.array(img2)

    # cv2.absdiff requires equally sized inputs; pages whose dimensions
    # changed between versions would otherwise raise. Scale the "before"
    # page onto the "after" page's pixel grid so they can still be compared.
    if before.shape[:2] != after.shape[:2]:
        height, width = after.shape[:2]
        before = cv2.resize(before, (width, height))

    gray_before = cv2.cvtColor(before, cv2.COLOR_RGB2GRAY)
    gray_after = cv2.cvtColor(after, cv2.COLOR_RGB2GRAY)

    diff = cv2.absdiff(gray_before, gray_after)
    _, changed_mask = cv2.threshold(diff, threshold, 255, cv2.THRESH_BINARY)

    highlighted = after.copy()
    highlighted[changed_mask == 255] = [255, 0, 0]  # Red highlights

    return Image.fromarray(highlighted), np.mean(diff)

def extract_text_with_layout(img):
    """OCR a page image with Tesseract and return the recognized text.

    Args:
        img: The image handed to ``pytesseract.image_to_string``.

    Returns:
        The string produced by Tesseract for ``img``.
    """
    # Imported at call time, so pytesseract is only required once OCR is
    # actually performed.
    import pytesseract

    # Flags are forwarded verbatim to Tesseract; preserve_interword_spaces=1
    # is what the "keeping layout" intent of this helper relies on.
    tesseract_flags = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
    recognized_text = pytesseract.image_to_string(img, config=tesseract_flags)
    return recognized_text

def generate_free_report(before_text, after_text, visual_desc):
    """Generate a difference report for one page using the OpenLLaMA model.

    Args:
        before_text: OCR text of the original page.
        after_text: OCR text of the modified page.
        visual_desc: Short note describing the visual diff for this page.

    Returns:
        The text generated by the model (the prompt itself is not included).
    """
    max_chars = 1500

    def clip(text):
        # Only claim truncation when text was actually cut, so the model is
        # not told that short pages are incomplete.
        if len(text) > max_chars:
            return f"{text[:max_chars]}... [truncated]"
        return text

    prompt = f"""
    Compare these document versions and provide a professional difference report:
    
    BEFORE VERSION:
    {clip(before_text)}
    
    AFTER VERSION:
    {clip(after_text)}
    
    VISUAL ANALYSIS NOTES:
    {visual_desc}
    
    Provide output in this format:
    1. SUMMARY: 2-3 sentence overview
    2. KEY CHANGES: Bullet points of specific changes
    3. ANALYSIS: Potential implications
    """

    # The model is placed with device_map="auto", so it is not guaranteed to
    # live on CUDA; send the inputs to wherever the model actually is.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True
    )

    # generate() returns prompt tokens followed by the continuation; drop the
    # prompt so the report does not echo the instructions and page text.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

def main():
    """Streamlit entry point: upload two PDFs and show a per-page diff report.

    Both uploads are written to a temporary directory, rendered to images,
    and compared page by page. Pages whose visual difference score exceeds
    the threshold are OCR'd and passed to the language model; the results
    are shown in one expander per changed page. A warning is shown when the
    two documents have different page counts, because only the pages present
    in both can be compared.
    """
    st.title("Free PDF Comparator")

    left, right = st.columns(2)
    with left:
        base_pdf = st.file_uploader("Original PDF", type=["pdf"])
    with right:
        changed_pdf = st.file_uploader("Modified PDF", type=["pdf"])

    if not st.button("Generate Report"):
        return
    if not (base_pdf and changed_pdf):
        st.warning("Please upload both PDF files")
        return

    # Mean grayscale difference above which a page counts as changed.
    min_diff_score = 5

    reports = []
    with st.spinner("Analyzing documents..."):
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save files
            base_path = os.path.join(temp_dir, "base.pdf")
            changed_path = os.path.join(temp_dir, "changed.pdf")
            for upload, path in ((base_pdf, base_path), (changed_pdf, changed_path)):
                with open(path, "wb") as f:
                    upload.seek(0)
                    f.write(upload.read())

            # Process
            base_images = pdf_to_images(base_path)
            changed_images = pdf_to_images(changed_path)

        page_pairs = zip(base_images, changed_images)
        for page_num, (img1, img2) in enumerate(page_pairs, start=1):
            # Visual diff
            highlighted, diff_score = highlight_differences(img1, img2)
            if diff_score <= min_diff_score:
                continue

            # Text extraction
            before_text = extract_text_with_layout(img1)
            after_text = extract_text_with_layout(img2)

            # Generate report
            visual_desc = f"Page {page_num} changes detected (score: {diff_score:.1f})"
            with st.spinner(f"Analyzing page {page_num}..."):
                report = generate_free_report(before_text, after_text, visual_desc)

            reports.append((page_num, report, highlighted))

    # zip() stops at the shorter document, so added or removed pages would
    # otherwise be ignored without any indication to the user.
    base_count = len(base_images)
    changed_count = len(changed_images)
    pages_match = base_count == changed_count
    if not pages_match:
        st.warning(
            f"Page count differs (original: {base_count}, "
            f"modified: {changed_count}). Only the first "
            f"{min(base_count, changed_count)} page(s) were compared."
        )

    # Display results
    if not reports:
        if pages_match:
            st.success("No significant differences found!")
        else:
            st.info("No significant differences found on the pages present in both documents.")
        return

    st.subheader("Comparison Report")
    for page_num, report, img in reports:
        with st.expander(f"Page {page_num} Analysis"):
            image_col, text_col = st.columns([1, 2])
            with image_col:
                st.image(img, use_column_width=True)
            with text_col:
                st.markdown(f"**Page {page_num} Report**")
                st.write(report.split("ANALYSIS:")[-1])  # Show just the analysis part

if __name__ == "__main__":
    main()