import streamlit as st
import fitz # PyMuPDF
import cv2
import numpy as np
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import tempfile
import torch
# Initialize free OpenLLaMA model (no auth needed)
model_name = "openlm-research/open_llama_7b_v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
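# Hedged sketch (not part of the original app): Streamlit reruns this script on
# every user interaction, so loading a 7B model at module level repeats the
# expensive load unless it is cached. One option, assuming Streamlit >= 1.18, is:
#
#   @st.cache_resource
#   def load_llm(name="openlm-research/open_llama_7b_v2"):
#       # use_fast=False: the auto-converted fast tokenizer for OpenLLaMA is known to be unreliable
#       tok = AutoTokenizer.from_pretrained(name, use_fast=False)
#       mdl = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.float16, device_map="auto")
#       return tok, mdl
#
#   tokenizer, model = load_llm()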
def pdf_to_images(pdf_path):
    """Convert PDF to high-res images using PyMuPDF"""
    doc = fitz.open(pdf_path)
    images = []
    for page in doc:
        pix = page.get_pixmap(dpi=200)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        images.append(img)
    return images
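# Usage sketch (hypothetical path): each returned element is a PIL.Image, one per
# page, rendered at 200 DPI (a US Letter page comes out at roughly 1700x2200 px).
#
#   pages = pdf_to_images("/tmp/example.pdf")
#   pages[0].save("/tmp/page_1.png")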
def highlight_differences(img1, img2):
    """Visual difference highlighting"""
    img1_np = np.array(img1)
    img2_np = np.array(img2)
    gray1 = cv2.cvtColor(img1_np, cv2.COLOR_RGB2GRAY)
    gray2 = cv2.cvtColor(img2_np, cv2.COLOR_RGB2GRAY)
    diff = cv2.absdiff(gray1, gray2)
    _, thresh = cv2.threshold(diff, 25, 255, cv2.THRESH_BINARY)
    highlighted = img2_np.copy()
    highlighted[thresh == 255] = [255, 0, 0]  # Red highlights
    return Image.fromarray(highlighted), np.mean(diff)
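# Note on the score: np.mean(diff) is the mean absolute grayscale difference over
# the whole page (0-255 scale), so the `diff_score > 5` check in main() flags pages
# whose pixels differ by roughly 2% or more on average. The fixed threshold of 25
# controls which individual pixels are painted red. This assumes both page images
# have identical dimensions; cv2.absdiff raises an error if they differ.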
def extract_text_with_layout(img):
    """Improved text extraction keeping layout"""
    import pytesseract
    custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
    return pytesseract.image_to_string(img, config=custom_config)
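# Deployment note (assumption about the host environment): pytesseract is only a
# wrapper, so the Tesseract OCR engine itself must be installed on the machine
# (e.g. the tesseract-ocr apt package) for image_to_string to work.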
def generate_free_report(before_text, after_text, visual_desc):
    """Generate report using free OpenLLaMA model"""
    prompt = f"""
Compare these document versions and provide a professional difference report:
BEFORE VERSION:
{before_text[:1500]}... [truncated]
AFTER VERSION:
{after_text[:1500]}... [truncated]
VISUAL ANALYSIS NOTES:
{visual_desc}
Provide output in this format:
1. SUMMARY: 2-3 sentence overview
2. KEY CHANGES: Bullet points of specific changes
3. ANALYSIS: Potential implications
"""
    # Send inputs to whatever device the model was dispatched to, rather than
    # hard-coding "cuda", so the app also runs on CPU-only hosts
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True
    )
    # Decode only the newly generated tokens so the prompt is not echoed back
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
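# Rough sizing note (estimate, not from the original code): open_llama_7b_v2 in
# float16 needs on the order of 13-14 GB of memory for the weights alone. On
# smaller GPUs, hedged alternatives are loading with load_in_8bit=True (requires
# the bitsandbytes package) or swapping in a smaller checkpoint such as
# openlm-research/open_llama_3b_v2.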
def main():
    st.title("Free PDF Comparator")
    col1, col2 = st.columns(2)
    with col1:
        base_pdf = st.file_uploader("Original PDF", type=["pdf"])
    with col2:
        changed_pdf = st.file_uploader("Modified PDF", type=["pdf"])
    if st.button("Generate Report"):
        if base_pdf and changed_pdf:
            with st.spinner("Analyzing documents..."):
                with tempfile.TemporaryDirectory() as temp_dir:
                    # Save uploaded files to disk so PyMuPDF can open them by path
                    base_path = os.path.join(temp_dir, "base.pdf")
                    changed_path = os.path.join(temp_dir, "changed.pdf")
                    with open(base_path, "wb") as f:
                        base_pdf.seek(0)
                        f.write(base_pdf.read())
                    with open(changed_path, "wb") as f:
                        changed_pdf.seek(0)
                        f.write(changed_pdf.read())
                    # Process both documents page by page
                    base_images = pdf_to_images(base_path)
                    changed_images = pdf_to_images(changed_path)
                    reports = []
                    for i, (img1, img2) in enumerate(zip(base_images, changed_images)):
                        # Visual diff
                        highlighted, diff_score = highlight_differences(img1, img2)
                        if diff_score > 5:  # Threshold for meaningful changes
                            # Text extraction
                            before_text = extract_text_with_layout(img1)
                            after_text = extract_text_with_layout(img2)
                            # Generate report
                            visual_desc = f"Page {i+1} changes detected (score: {diff_score:.1f})"
                            with st.spinner(f"Analyzing page {i+1}..."):
                                report = generate_free_report(before_text, after_text, visual_desc)
                            reports.append((i + 1, report, highlighted))
                    # Display results
                    if not reports:
                        st.success("No significant differences found!")
                    else:
                        st.subheader("Comparison Report")
                        for page_num, report, img in reports:
                            with st.expander(f"Page {page_num} Analysis"):
                                col1, col2 = st.columns([1, 2])
                                with col1:
                                    st.image(img, use_column_width=True)
                                with col2:
                                    st.markdown(f"**Page {page_num} Report**")
                                    st.write(report.split("ANALYSIS:")[-1])  # Show just the analysis part
        else:
            st.warning("Please upload both PDF files")


if __name__ == "__main__":
    main()
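# Local testing uses the standard Streamlit invocation: `streamlit run <this_file>.py`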