# CUAD_contract / app.py
# ludigija's picture
# Update app.py
# e4fcaef verified
import streamlit as st
import fitz # PyMuPDF
import cv2
import numpy as np
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import tempfile
import torch
# Initialize free OpenLLaMA model (no auth needed)
model_name = "openlm-research/open_llama_7b_v2"


@st.cache_resource
def _load_tokenizer_and_model(name):
    """Load the tokenizer and causal language model for ``name`` once.

    Streamlit re-executes this script from the top on every widget
    interaction. Caching the loaded objects as a shared resource avoids
    re-reading the model weights on each rerun.

    Args:
        name: Hugging Face Hub identifier of the model to load.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return loaded_tokenizer, loaded_model


# Keep the module-level names the rest of the file relies on.
tokenizer, model = _load_tokenizer_and_model(model_name)
def pdf_to_images(pdf_path):
    """Convert a PDF to high-resolution images using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of RGB ``PIL.Image`` objects, one per page in document
        order, each rendered at 200 DPI.
    """
    images = []
    # Open the document as a context manager so it is closed even if a page
    # fails to render; an open handle can also keep the caller's temporary
    # directory from being cleaned up.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=200)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)
    return images
def highlight_differences(img1, img2):
    """Highlight the pixels of ``img2`` that differ from ``img1``.

    Both images are converted to grayscale and compared pixel by pixel.
    Pixels whose absolute grayscale difference exceeds 25 are painted red
    on a copy of ``img2``.

    Args:
        img1: Reference image (RGB).
        img2: Image to compare against the reference (RGB).

    Returns:
        A tuple ``(highlighted, score)`` where ``highlighted`` is a PIL
        image (a copy of ``img2`` with changed pixels in red) and ``score``
        is the mean absolute grayscale difference.
    """
    img1_np = np.array(img1)
    img2_np = np.array(img2)
    gray1 = cv2.cvtColor(img1_np, cv2.COLOR_RGB2GRAY)
    gray2 = cv2.cvtColor(img2_np, cv2.COLOR_RGB2GRAY)

    if gray1.shape != gray2.shape:
        # cv2.absdiff needs equally sized arrays. Pages with different
        # dimensions would otherwise raise, so scale the reference to the
        # size of the image that gets highlighted.
        target_height, target_width = gray2.shape
        gray1 = cv2.resize(
            gray1,
            (target_width, target_height),  # cv2 expects (width, height)
            interpolation=cv2.INTER_AREA,
        )

    diff = cv2.absdiff(gray1, gray2)
    _, thresh = cv2.threshold(diff, 25, 255, cv2.THRESH_BINARY)
    highlighted = img2_np.copy()
    highlighted[thresh == 255] = [255, 0, 0]  # Red highlights
    return Image.fromarray(highlighted), np.mean(diff)
def extract_text_with_layout(img):
    """Run OCR on ``img`` with pytesseract and return the recognised text.

    The Tesseract options are passed through verbatim; they include
    ``preserve_interword_spaces=1``, presumably to keep the horizontal
    spacing of the page in the output.

    Args:
        img: Image to run OCR on.

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    # Imported inside the function so pytesseract is only needed when OCR runs.
    from pytesseract import image_to_string

    return image_to_string(
        img,
        config=r'--oem 3 --psm 6 -c preserve_interword_spaces=1',
    )
def generate_free_report(before_text, after_text, visual_desc):
    """Generate a difference report using the free OpenLLaMA model.

    Builds a prompt from the first 1500 characters of each text version plus
    the visual-analysis note, samples up to 512 new tokens from the
    module-level ``model`` and returns the generated text.

    Args:
        before_text: Text of the original page.
        after_text: Text of the modified page.
        visual_desc: Short description of the visual diff for this page.

    Returns:
        The model's generated continuation as a string. The prompt itself is
        not included, so markers such as "ANALYSIS:" in the result come from
        the model's answer rather than from the instructions.
    """
    prompt = f"""
Compare these document versions and provide a professional difference report:
BEFORE VERSION:
{before_text[:1500]}... [truncated]
AFTER VERSION:
{after_text[:1500]}... [truncated]
VISUAL ANALYSIS NOTES:
{visual_desc}
Provide output in this format:
1. SUMMARY: 2-3 sentence overview
2. KEY CHANGES: Bullet points of specific changes
3. ANALYSIS: Potential implications
"""
    # Send the inputs to whatever device the model was placed on instead of
    # assuming CUDA, so this also works on CPU-only hosts.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
    )
    # generate() returns prompt tokens followed by the new tokens; decode
    # only the newly generated part.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
def main():
    """Streamlit entry point: upload two PDFs and report page differences.

    Renders both PDFs to images and compares them page by page. For every
    page whose visual difference score exceeds 5, it runs OCR on both
    versions and asks the language model for a report. Results are shown in
    one expander per changed page, with the highlighted image next to the
    analysis text.
    """
    st.title("Free PDF Comparator")
    col1, col2 = st.columns(2)
    with col1:
        base_pdf = st.file_uploader("Original PDF", type=["pdf"])
    with col2:
        changed_pdf = st.file_uploader("Modified PDF", type=["pdf"])

    if not st.button("Generate Report"):
        return
    if not (base_pdf and changed_pdf):
        st.warning("Please upload both PDF files")
        return

    with st.spinner("Analyzing documents..."):
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save files
            base_path = os.path.join(temp_dir, "base.pdf")
            changed_path = os.path.join(temp_dir, "changed.pdf")
            with open(base_path, "wb") as f:
                base_pdf.seek(0)
                f.write(base_pdf.read())
            with open(changed_path, "wb") as f:
                changed_pdf.seek(0)
                f.write(changed_pdf.read())

            # Process
            base_images = pdf_to_images(base_path)
            changed_images = pdf_to_images(changed_path)

            # zip() below stops at the shorter document, so tell the user
            # when trailing pages are left out of the comparison.
            if len(base_images) != len(changed_images):
                compared = min(len(base_images), len(changed_images))
                st.warning(
                    f"Page count differs (original: {len(base_images)}, "
                    f"modified: {len(changed_images)}). "
                    f"Only the first {compared} page(s) are compared."
                )

            reports = []
            page_pairs = zip(base_images, changed_images)
            for page_num, (img1, img2) in enumerate(page_pairs, start=1):
                # Visual diff
                highlighted, diff_score = highlight_differences(img1, img2)
                if diff_score > 5:  # Threshold for meaningful changes
                    # Text extraction
                    before_text = extract_text_with_layout(img1)
                    after_text = extract_text_with_layout(img2)
                    # Generate report
                    visual_desc = (
                        f"Page {page_num} changes detected "
                        f"(score: {diff_score:.1f})"
                    )
                    with st.spinner(f"Analyzing page {page_num}..."):
                        report = generate_free_report(
                            before_text, after_text, visual_desc
                        )
                    reports.append((page_num, report, highlighted))

            # Display results
            if not reports:
                st.success("No significant differences found!")
            else:
                st.subheader("Comparison Report")
                for page_num, report, img in reports:
                    with st.expander(f"Page {page_num} Analysis"):
                        image_col, text_col = st.columns([1, 2])
                        with image_col:
                            st.image(img, use_column_width=True)
                        with text_col:
                            st.markdown(f"**Page {page_num} Report**")
                            # Show just the analysis part
                            st.write(report.split("ANALYSIS:")[-1])


if __name__ == "__main__":
    main()