import streamlit as st
import fitz # PyMuPDF
import cv2
import numpy as np
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import tempfile
import torch
# Initialize free OpenLLaMA model (no auth needed)
model_name = "openlm-research/open_llama_7b_v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
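# Hedged sketch (not part of the original app): Streamlit reruns this script on
# every user interaction, so loading a 7B model at module level repeats the
# expensive load unless it is cached. One option, assuming Streamlit >= 1.18, is:
#
#   @st.cache_resource
#   def load_llm(name="openlm-research/open_llama_7b_v2"):
#       # use_fast=False: the auto-converted fast tokenizer for OpenLLaMA is known to be unreliable
#       tok = AutoTokenizer.from_pretrained(name, use_fast=False)
#       mdl = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.float16, device_map="auto")
#       return tok, mdl
#
#   tokenizer, model = load_llm()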
def pdf_to_images(pdf_path):
    """Convert PDF to high-res images using PyMuPDF"""
    doc = fitz.open(pdf_path)
    images = []
    for page in doc:
        pix = page.get_pixmap(dpi=200)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        images.append(img)
    return images
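# Usage sketch (hypothetical path): each returned element is a PIL.Image, one per
# page, rendered at 200 DPI (a US Letter page comes out at roughly 1700x2200 px).
#
#   pages = pdf_to_images("/tmp/example.pdf")
#   pages[0].save("/tmp/page_1.png")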
def highlight_differences(img1, img2):
    """Visual difference highlighting"""
    img1_np = np.array(img1)
    img2_np = np.array(img2)
    gray1 = cv2.cvtColor(img1_np, cv2.COLOR_RGB2GRAY)
    gray2 = cv2.cvtColor(img2_np, cv2.COLOR_RGB2GRAY)
    diff = cv2.absdiff(gray1, gray2)
    _, thresh = cv2.threshold(diff, 25, 255, cv2.THRESH_BINARY)
    highlighted = img2_np.copy()
    highlighted[thresh == 255] = [255, 0, 0]  # Red highlights
    return Image.fromarray(highlighted), np.mean(diff)
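# Note on the score: np.mean(diff) is the mean absolute grayscale difference over
# the whole page (0-255 scale), so the `diff_score > 5` check in main() flags pages
# whose pixels differ by roughly 2% or more on average. The fixed threshold of 25
# controls which individual pixels are painted red. This assumes both page images
# have identical dimensions; cv2.absdiff raises an error if they differ.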
def extract_text_with_layout(img):
    """Improved text extraction keeping layout"""
    import pytesseract
    custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
    return pytesseract.image_to_string(img, config=custom_config)
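# Deployment note (assumption about the host environment): pytesseract is only a
# wrapper, so the Tesseract OCR engine itself must be installed on the machine
# (e.g. the tesseract-ocr apt package) for image_to_string to work.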
def generate_free_report(before_text, after_text, visual_desc):
    """Generate report using free OpenLLaMA model"""
    prompt = f"""
Compare these document versions and provide a professional difference report:
BEFORE VERSION:
{before_text[:1500]}... [truncated]
AFTER VERSION:
{after_text[:1500]}... [truncated]
VISUAL ANALYSIS NOTES:
{visual_desc}
Provide output in this format:
1. SUMMARY: 2-3 sentence overview
2. KEY CHANGES: Bullet points of specific changes
3. ANALYSIS: Potential implications
"""
    # Send inputs to whatever device the model was dispatched to, rather than
    # hard-coding "cuda", so the app also runs on CPU-only hosts
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True
    )
    # Decode only the newly generated tokens so the prompt is not echoed back
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
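# Rough sizing note (estimate, not from the original code): open_llama_7b_v2 in
# float16 needs on the order of 13-14 GB of memory for the weights alone. On
# smaller GPUs, hedged alternatives are loading with load_in_8bit=True (requires
# the bitsandbytes package) or swapping in a smaller checkpoint such as
# openlm-research/open_llama_3b_v2.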
def main():
    st.title("Free PDF Comparator")
    col1, col2 = st.columns(2)
    with col1:
        base_pdf = st.file_uploader("Original PDF", type=["pdf"])
    with col2:
        changed_pdf = st.file_uploader("Modified PDF", type=["pdf"])
    if st.button("Generate Report"):
        if base_pdf and changed_pdf:
            with st.spinner("Analyzing documents..."):
                with tempfile.TemporaryDirectory() as temp_dir:
                    # Save uploaded files to disk so PyMuPDF can open them by path
                    base_path = os.path.join(temp_dir, "base.pdf")
                    changed_path = os.path.join(temp_dir, "changed.pdf")
                    with open(base_path, "wb") as f:
                        base_pdf.seek(0)
                        f.write(base_pdf.read())
                    with open(changed_path, "wb") as f:
                        changed_pdf.seek(0)
                        f.write(changed_pdf.read())
                    # Process both documents page by page
                    base_images = pdf_to_images(base_path)
                    changed_images = pdf_to_images(changed_path)
                    reports = []
                    for i, (img1, img2) in enumerate(zip(base_images, changed_images)):
                        # Visual diff
                        highlighted, diff_score = highlight_differences(img1, img2)
                        if diff_score > 5:  # Threshold for meaningful changes
                            # Text extraction
                            before_text = extract_text_with_layout(img1)
                            after_text = extract_text_with_layout(img2)
                            # Generate report
                            visual_desc = f"Page {i+1} changes detected (score: {diff_score:.1f})"
                            with st.spinner(f"Analyzing page {i+1}..."):
                                report = generate_free_report(before_text, after_text, visual_desc)
                            reports.append((i + 1, report, highlighted))
                    # Display results
                    if not reports:
                        st.success("No significant differences found!")
                    else:
                        st.subheader("Comparison Report")
                        for page_num, report, img in reports:
                            with st.expander(f"Page {page_num} Analysis"):
                                col1, col2 = st.columns([1, 2])
                                with col1:
                                    st.image(img, use_column_width=True)
                                with col2:
                                    st.markdown(f"**Page {page_num} Report**")
                                    st.write(report.split("ANALYSIS:")[-1])  # Show just the analysis part
        else:
            st.warning("Please upload both PDF files")


if __name__ == "__main__":
    main()
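# Local testing uses the standard Streamlit invocation: `streamlit run <this_file>.py`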