Spaces:
Sleeping
Sleeping
import streamlit as st | |
import fitz # PyMuPDF | |
import cv2 | |
import numpy as np | |
from PIL import Image | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
import os | |
import tempfile | |
import torch | |
# Initialize free OpenLLaMA model (no auth needed)
model_name = "openlm-research/open_llama_7b_v2"


@st.cache_resource
def _load_model(name):
    """Load the tokenizer and causal LM for *name* and return them as a pair.

    Streamlit re-executes this script from the top on every widget
    interaction; without caching, the 7B checkpoint would be re-read on each
    rerun. ``st.cache_resource`` keeps one shared instance for the lifetime
    of the server process.

    Args:
        name: Hugging Face model identifier to load.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return loaded_tokenizer, loaded_model


# Module-level names are kept because the rest of the file reads them directly.
tokenizer, model = _load_model(model_name)
def pdf_to_images(pdf_path, dpi=200):
    """Render every page of a PDF to a PIL image using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Rendering resolution passed to ``page.get_pixmap``.
            Defaults to 200, the value previously hard-coded here.

    Returns:
        A list with one RGB ``PIL.Image`` per page, in page order.
    """
    images = []
    # Close the document deterministically: an open handle leaks the file
    # descriptor and can prevent the caller from deleting the file afterwards.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            # frombytes copies the pixel buffer, so the image stays valid
            # after the document is closed.
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images
def highlight_differences(img1, img2):
    """Mark pixels that differ between two page images in red.

    Both inputs are converted to arrays and to grayscale; pixels whose
    absolute grayscale difference exceeds 25 are painted red on a copy of
    the second image.

    Args:
        img1: The "before" image (anything ``np.array`` accepts as RGB data).
        img2: The "after" image; the highlighted output is based on it.

    Returns:
        A tuple ``(highlighted_image, mean_difference)`` where
        ``highlighted_image`` is a ``PIL.Image`` and ``mean_difference`` is
        the mean of the absolute grayscale difference over all pixels.
    """
    img1_np = np.array(img1)
    img2_np = np.array(img2)

    # cv2.absdiff requires equally sized inputs. Pages of different
    # dimensions would otherwise raise, so scale the "before" image to the
    # "after" image's size first.
    if img1_np.shape[:2] != img2_np.shape[:2]:
        target_height, target_width = img2_np.shape[:2]
        # cv2.resize takes dsize as (width, height).
        img1_np = cv2.resize(img1_np, (target_width, target_height))

    gray1 = cv2.cvtColor(img1_np, cv2.COLOR_RGB2GRAY)
    gray2 = cv2.cvtColor(img2_np, cv2.COLOR_RGB2GRAY)
    diff = cv2.absdiff(gray1, gray2)
    _, thresh = cv2.threshold(diff, 25, 255, cv2.THRESH_BINARY)

    highlighted = img2_np.copy()
    highlighted[thresh == 255] = [255, 0, 0]  # Red highlights
    return Image.fromarray(highlighted), np.mean(diff)
def extract_text_with_layout(img):
    """Run Tesseract OCR on *img* and return the recognised text.

    Tesseract is invoked with ``--oem 3 --psm 6`` and
    ``preserve_interword_spaces=1`` so that spacing between words is kept
    in the returned string.
    """
    # Imported inside the function so pytesseract is only required once OCR
    # is actually requested.
    from pytesseract import image_to_string

    tesseract_flags = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
    ocr_text = image_to_string(img, config=tesseract_flags)
    return ocr_text
def _excerpt(text, limit=1500):
    """Return *text* cut to *limit* characters, flagged only if it was cut."""
    if len(text) <= limit:
        return text
    return f"{text[:limit]}... [truncated]"


def generate_free_report(before_text, after_text, visual_desc):
    """Generate a difference report using the free OpenLLaMA model.

    Args:
        before_text: Text extracted from the original page.
        after_text: Text extracted from the modified page.
        visual_desc: Short note describing the visual diff for the page.

    Returns:
        The text generated by the model, without the prompt echoed back.
    """
    # The prompt body is kept flush-left so no indentation leaks into it.
    prompt = f"""
Compare these document versions and provide a professional difference report:
BEFORE VERSION:
{_excerpt(before_text)}
AFTER VERSION:
{_excerpt(after_text)}
VISUAL ANALYSIS NOTES:
{visual_desc}
Provide output in this format:
1. SUMMARY: 2-3 sentence overview
2. KEY CHANGES: Bullet points of specific changes
3. ANALYSIS: Potential implications
"""
    # The model is placed by device_map="auto", so follow its device instead
    # of assuming CUDA; a hard-coded "cuda" fails on CPU-only hosts.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
    )
    # A causal LM returns prompt tokens followed by the continuation. Decode
    # only the continuation so the report does not contain the prompt (which
    # itself includes the "ANALYSIS:" heading from the format instructions).
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)
def main():
    """Render the Streamlit UI and run the PDF comparison.

    The user uploads an original and a modified PDF. When "Generate Report"
    is pressed, both files are rendered to images and compared page by page;
    every page whose mean pixel difference exceeds 5 is OCR'd and sent to
    the language model, and the results are shown in one expander per page.
    """
    st.title("Free PDF Comparator")
    upload_left, upload_right = st.columns(2)
    with upload_left:
        base_pdf = st.file_uploader("Original PDF", type=["pdf"])
    with upload_right:
        changed_pdf = st.file_uploader("Modified PDF", type=["pdf"])

    if not st.button("Generate Report"):
        return
    if not (base_pdf and changed_pdf):
        st.warning("Please upload both PDF files")
        return

    with st.spinner("Analyzing documents..."):
        with tempfile.TemporaryDirectory() as temp_dir:
            # pdf_to_images takes a path, so persist the uploads first.
            base_path = os.path.join(temp_dir, "base.pdf")
            changed_path = os.path.join(temp_dir, "changed.pdf")
            for upload, path in ((base_pdf, base_path), (changed_pdf, changed_path)):
                upload.seek(0)
                with open(path, "wb") as f:
                    f.write(upload.read())

            base_images = pdf_to_images(base_path)
            changed_images = pdf_to_images(changed_path)

        # zip() stops at the shorter document, so added or removed pages
        # would otherwise go unreported; tell the user explicitly.
        if len(base_images) != len(changed_images):
            compared = min(len(base_images), len(changed_images))
            st.warning(
                f"The documents have different page counts "
                f"({len(base_images)} vs {len(changed_images)}); "
                f"only the first {compared} page(s) were compared."
            )

        reports = []
        page_pairs = zip(base_images, changed_images)
        for page_num, (img1, img2) in enumerate(page_pairs, start=1):
            # Visual diff
            highlighted, diff_score = highlight_differences(img1, img2)
            if diff_score <= 5:  # Threshold for meaningful changes
                continue
            # Text extraction
            before_text = extract_text_with_layout(img1)
            after_text = extract_text_with_layout(img2)
            # Generate report
            visual_desc = f"Page {page_num} changes detected (score: {diff_score:.1f})"
            with st.spinner(f"Analyzing page {page_num}..."):
                report = generate_free_report(before_text, after_text, visual_desc)
            reports.append((page_num, report, highlighted))

    # Display results
    if not reports:
        st.success("No significant differences found!")
        return

    st.subheader("Comparison Report")
    for page_num, report, img in reports:
        with st.expander(f"Page {page_num} Analysis"):
            image_col, text_col = st.columns([1, 2])
            with image_col:
                st.image(img, use_column_width=True)
            with text_col:
                st.markdown(f"**Page {page_num} Report**")
                # Show just the analysis part
                st.write(report.split("ANALYSIS:")[-1])
# Script entry point: build the UI only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()