"""Streamlit app that compares two PDFs page by page.

Each page is rendered to an image, a pixel-level diff is computed, and for
pages whose diff exceeds a threshold the OCR'd text of both versions is fed
to a locally hosted OpenLLaMA model to produce a written change report.
"""

import os
import tempfile

import cv2
import fitz  # PyMuPDF
import numpy as np
import streamlit as st
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize free OpenLLaMA model (no auth needed)
model_name = "openlm-research/open_llama_7b_v2"


@st.cache_resource
def _load_model(name):
    """Load the tokenizer and model once per server process.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the 7B checkpoint would be reloaded on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # NOTE(review): the OpenLLaMA model card reportedly advises against the
    # auto-converted fast tokenizer (use_fast=False) - confirm before changing.
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return loaded_tokenizer, loaded_model


tokenizer, model = _load_model(model_name)


def pdf_to_images(pdf_path):
    """Convert PDF to high-res images using PyMuPDF.

    Args:
        pdf_path: Filesystem path of the PDF to render.

    Returns:
        A list of RGB ``PIL.Image`` objects, one per page, rendered at 200 dpi.
    """
    images = []
    # The context manager closes the document so its file handle is released
    # before the caller removes the temporary directory holding the PDF.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=200)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
    return images


def highlight_differences(img1, img2):
    """Visual difference highlighting.

    Args:
        img1: Baseline page image (PIL).
        img2: Modified page image (PIL).

    Returns:
        A tuple ``(highlighted, score)`` where ``highlighted`` is a copy of
        ``img2`` with changed pixels painted red, and ``score`` is the mean
        absolute grayscale difference between the two pages.
    """
    before = img1.convert("RGB")
    after = img2.convert("RGB")
    # cv2.absdiff requires equally sized arrays. Pages whose dimensions differ
    # would otherwise raise, so the baseline is scaled to the modified page;
    # the returned image keeps the modified page's original size.
    if before.size != after.size:
        before = before.resize(after.size)

    img1_np = np.array(before)
    img2_np = np.array(after)

    gray1 = cv2.cvtColor(img1_np, cv2.COLOR_RGB2GRAY)
    gray2 = cv2.cvtColor(img2_np, cv2.COLOR_RGB2GRAY)

    diff = cv2.absdiff(gray1, gray2)
    # Pixels differing by more than 25 gray levels are treated as changed.
    _, thresh = cv2.threshold(diff, 25, 255, cv2.THRESH_BINARY)

    highlighted = img2_np.copy()
    highlighted[thresh == 255] = [255, 0, 0]  # Red highlights

    return Image.fromarray(highlighted), float(diff.mean())


def extract_text_with_layout(img):
    """Improved text extraction keeping layout.

    Args:
        img: Page image to run OCR on.

    Returns:
        The text recognised by Tesseract as a string.
    """
    # Imported lazily so the OCR dependency is only needed when a page
    # actually has to be analysed.
    import pytesseract

    custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
    return pytesseract.image_to_string(img, config=custom_config)


def generate_free_report(before_text, after_text, visual_desc):
    """Generate report using free OpenLLaMA model.

    Args:
        before_text: OCR text of the original page (first 1500 chars are used).
        after_text: OCR text of the modified page (first 1500 chars are used).
        visual_desc: Short note describing the visual diff result.

    Returns:
        The text generated by the model, without the prompt.
    """
    prompt = f"""
    Compare these document versions and provide a professional difference report:

    BEFORE VERSION:
    {before_text[:1500]}... [truncated]

    AFTER VERSION:
    {after_text[:1500]}... [truncated]

    VISUAL ANALYSIS NOTES:
    {visual_desc}

    Provide output in this format:
    1. SUMMARY: 2-3 sentence overview
    2. KEY CHANGES: Bullet points of specific changes
    3. ANALYSIS: Potential implications
    """

    # Follow the device the model was actually placed on instead of assuming
    # CUDA, so the function also works on CPU-only hosts.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
    )

    # A causal LM returns the prompt tokens followed by the completion. Only
    # the completion is decoded; otherwise the prompt's own "ANALYSIS:" line
    # would leak into the report and confuse callers that split on it.
    prompt_len = inputs["input_ids"].shape[1]
    completion = outputs[0][prompt_len:]
    return tokenizer.decode(completion, skip_special_tokens=True).strip()


def main():
    """Render the Streamlit UI and run the comparison on demand."""
    st.title("Free PDF Comparator")

    col1, col2 = st.columns(2)
    with col1:
        base_pdf = st.file_uploader("Original PDF", type=["pdf"])
    with col2:
        changed_pdf = st.file_uploader("Modified PDF", type=["pdf"])

    if not st.button("Generate Report"):
        return
    if not (base_pdf and changed_pdf):
        st.warning("Please upload both PDF files")
        return

    reports = []
    with st.spinner("Analyzing documents..."):
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save files
            base_path = os.path.join(temp_dir, "base.pdf")
            changed_path = os.path.join(temp_dir, "changed.pdf")

            with open(base_path, "wb") as f:
                base_pdf.seek(0)
                f.write(base_pdf.read())
            with open(changed_path, "wb") as f:
                changed_pdf.seek(0)
                f.write(changed_pdf.read())

            # Process
            base_images = pdf_to_images(base_path)
            changed_images = pdf_to_images(changed_path)

        # zip() stops at the shorter document, so surface the mismatch rather
        # than silently ignoring added or removed pages.
        if len(base_images) != len(changed_images):
            compared = min(len(base_images), len(changed_images))
            st.warning(
                f"Page counts differ ({len(base_images)} vs "
                f"{len(changed_images)}); only the first {compared} "
                "page(s) are compared."
            )

        for i, (img1, img2) in enumerate(zip(base_images, changed_images)):
            # Visual diff
            highlighted, diff_score = highlight_differences(img1, img2)
            if diff_score <= 5:  # Threshold for meaningful changes
                continue

            # Text extraction
            before_text = extract_text_with_layout(img1)
            after_text = extract_text_with_layout(img2)

            # Generate report
            visual_desc = f"Page {i+1} changes detected (score: {diff_score:.1f})"
            with st.spinner(f"Analyzing page {i+1}..."):
                report = generate_free_report(before_text, after_text, visual_desc)
            reports.append((i + 1, report, highlighted))

    # Display results
    if not reports:
        st.success("No significant differences found!")
        return

    st.subheader("Comparison Report")
    for page_num, report, img in reports:
        with st.expander(f"Page {page_num} Analysis"):
            image_col, text_col = st.columns([1, 2])
            with image_col:
                st.image(img, use_column_width=True)
            with text_col:
                st.markdown(f"**Page {page_num} Report**")
                # Show just the analysis part
                st.write(report.split("ANALYSIS:")[-1])


if __name__ == "__main__":
    main()