Spaces:

ludigija
/

CUAD_contract

Sleeping

App Files Files Community

CUAD_contract / app.py

ludigija

Update app.py

e4fcaef verified about 1 month ago

raw

history blame contribute delete

5.41 kB

	import streamlit as st
	import fitz # PyMuPDF
	import cv2
	import numpy as np
	from PIL import Image
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import os
	import tempfile
	import torch

	# Initialize free OpenLLaMA model (no auth needed)
	model_name = "openlm-research/open_llama_7b_v2"


	@st.cache_resource
	def _load_model(name):
	    """Load the tokenizer and causal language model for ``name``.

	    Streamlit re-executes this script from the top on every widget
	    interaction. Caching the loaded objects with ``st.cache_resource``
	    means the checkpoint is loaded once and reused on later reruns
	    instead of being reloaded each time.

	    Args:
	        name: Hugging Face model identifier to load.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
	    loaded_model = AutoModelForCausalLM.from_pretrained(
	        name,
	        torch_dtype=torch.float16,  # load weights in half precision
	        device_map="auto",  # device placement is chosen by the loader
	    )
	    return loaded_tokenizer, loaded_model


	# Module-level names used by generate_free_report().
	tokenizer, model = _load_model(model_name)

	def pdf_to_images(pdf_path, dpi=200):
	    """Render every page of a PDF to a PIL image using PyMuPDF.

	    Args:
	        pdf_path: Path of the PDF file to open.
	        dpi: Resolution passed to ``page.get_pixmap``. Defaults to 200.

	    Returns:
	        A list with one ``"RGB"`` ``PIL.Image`` per page, in page order.
	    """
	    images = []
	    # The context manager closes the document even if rendering a page
	    # raises, so the underlying file handle is never leaked.
	    with fitz.open(pdf_path) as doc:
	        for page in doc:
	            pix = page.get_pixmap(dpi=dpi)
	            # frombytes copies the pixel buffer, so the image stays valid
	            # after the document is closed.
	            images.append(
	                Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
	            )
	    return images

	def highlight_differences(img1, img2, threshold=25):
	    """Mark the pixels that differ between two page images.

	    Both images are converted to grayscale and compared pixel by pixel.
	    Every pixel whose absolute grayscale difference is greater than
	    ``threshold`` is painted red on a copy of ``img2``.

	    Args:
	        img1: The "before" image (RGB; anything ``np.array`` accepts).
	        img2: The "after" image (RGB; anything ``np.array`` accepts).
	        threshold: Minimum grayscale difference (0-255) that counts as a
	            change. Defaults to 25.

	    Returns:
	        A tuple ``(highlighted, score)`` where ``highlighted`` is a
	        ``PIL.Image`` copy of ``img2`` with changed pixels in red and
	        ``score`` is the mean absolute grayscale difference.
	    """
	    img1_np = np.array(img1)
	    img2_np = np.array(img2)

	    gray1 = cv2.cvtColor(img1_np, cv2.COLOR_RGB2GRAY)
	    gray2 = cv2.cvtColor(img2_np, cv2.COLOR_RGB2GRAY)

	    # cv2.absdiff needs equally sized arrays. Pages with different
	    # dimensions would otherwise raise, so scale the "before" page to the
	    # "after" page's size first.
	    if gray1.shape != gray2.shape:
	        height, width = gray2.shape
	        gray1 = cv2.resize(gray1, (width, height))  # dsize is (width, height)

	    diff = cv2.absdiff(gray1, gray2)
	    _, mask = cv2.threshold(diff, threshold, 255, cv2.THRESH_BINARY)

	    highlighted = img2_np.copy()
	    highlighted[mask == 255] = [255, 0, 0]  # Red highlights

	    return Image.fromarray(highlighted), np.mean(diff)

	def extract_text_with_layout(img):
	    """Run Tesseract OCR on ``img`` and return the recognised text.

	    The Tesseract options select OCR engine mode 3 and page segmentation
	    mode 6, and set ``preserve_interword_spaces=1`` so spacing between
	    words is kept in the output.
	    """
	    # Imported at call time rather than at module import.
	    from pytesseract import image_to_string

	    tesseract_flags = " ".join(
	        ("--oem 3", "--psm 6", "-c preserve_interword_spaces=1")
	    )
	    return image_to_string(img, config=tesseract_flags)

	def _excerpt_for_prompt(text, max_chars):
	    """Return ``text`` limited to ``max_chars``, flagging it only if cut."""
	    if len(text) <= max_chars:
	        return text
	    return f"{text[:max_chars]}... [truncated]"


	def generate_free_report(before_text, after_text, visual_desc, max_chars=1500):
	    """Generate a difference report with the module-level OpenLLaMA model.

	    Args:
	        before_text: Text of the original page.
	        after_text: Text of the modified page.
	        visual_desc: Short note describing the visual comparison result.
	        max_chars: Maximum number of characters of each text version that
	            is placed in the prompt. Defaults to 1500.

	    Returns:
	        The text generated by the model, without the prompt.
	    """
	    # Only mark a version as truncated when it actually was cut.
	    before_excerpt = _excerpt_for_prompt(before_text, max_chars)
	    after_excerpt = _excerpt_for_prompt(after_text, max_chars)

	    prompt = f"""
	Compare these document versions and provide a professional difference report:

	BEFORE VERSION:
	{before_excerpt}

	AFTER VERSION:
	{after_excerpt}

	VISUAL ANALYSIS NOTES:
	{visual_desc}

	Provide output in this format:
	1. SUMMARY: 2-3 sentence overview
	2. KEY CHANGES: Bullet points of specific changes
	3. ANALYSIS: Potential implications
	"""

	    # Follow the model's actual device instead of assuming CUDA: the model
	    # is loaded with device_map="auto", so it may not be on a GPU at all.
	    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
	    outputs = model.generate(
	        **inputs,
	        max_new_tokens=512,
	        temperature=0.7,
	        do_sample=True,
	    )

	    # The generated sequence starts with the prompt tokens. Decode only the
	    # continuation so the report does not echo the prompt (which itself
	    # contains the "ANALYSIS:" marker that callers split on).
	    prompt_length = inputs["input_ids"].shape[-1]
	    generated_tokens = outputs[0][prompt_length:]
	    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

	def _save_upload(uploaded_file, dest_path):
	    """Write an uploaded file to ``dest_path``, reading from its start."""
	    uploaded_file.seek(0)
	    with open(dest_path, "wb") as out:
	        out.write(uploaded_file.read())


	def main():
	    """Streamlit UI: upload two PDFs and show a per-page difference report.

	    The two uploads are written to a temporary directory and rendered to
	    images. Pages are compared pairwise; every page whose visual difference
	    score is above 5 is OCR'd and passed to ``generate_free_report``. The
	    results are then shown in one expander per changed page.
	    """
	    st.title("Free PDF Comparator")

	    col1, col2 = st.columns(2)
	    with col1:
	        base_pdf = st.file_uploader("Original PDF", type=["pdf"])
	    with col2:
	        changed_pdf = st.file_uploader("Modified PDF", type=["pdf"])

	    # Guard clauses keep the main flow flat.
	    if not st.button("Generate Report"):
	        return
	    if not (base_pdf and changed_pdf):
	        st.warning("Please upload both PDF files")
	        return

	    reports = []
	    with st.spinner("Analyzing documents..."):
	        with tempfile.TemporaryDirectory() as temp_dir:
	            # Save files
	            base_path = os.path.join(temp_dir, "base.pdf")
	            changed_path = os.path.join(temp_dir, "changed.pdf")
	            _save_upload(base_pdf, base_path)
	            _save_upload(changed_pdf, changed_path)

	            # Process
	            base_images = pdf_to_images(base_path)
	            changed_images = pdf_to_images(changed_path)

	        # zip() stops at the shorter document, so tell the user when
	        # trailing pages are not part of the comparison.
	        if len(base_images) != len(changed_images):
	            compared = min(len(base_images), len(changed_images))
	            st.warning(
	                f"Page counts differ (original: {len(base_images)}, "
	                f"modified: {len(changed_images)}). "
	                f"Only the first {compared} page(s) are compared."
	            )

	        page_pairs = zip(base_images, changed_images)
	        for page_num, (img1, img2) in enumerate(page_pairs, start=1):
	            # Visual diff
	            highlighted, diff_score = highlight_differences(img1, img2)

	            if diff_score > 5:  # Threshold for meaningful changes
	                # Text extraction
	                before_text = extract_text_with_layout(img1)
	                after_text = extract_text_with_layout(img2)

	                # Generate report
	                visual_desc = (
	                    f"Page {page_num} changes detected (score: {diff_score:.1f})"
	                )
	                with st.spinner(f"Analyzing page {page_num}..."):
	                    report = generate_free_report(
	                        before_text, after_text, visual_desc
	                    )

	                reports.append((page_num, report, highlighted))

	    # Display results
	    if not reports:
	        st.success("No significant differences found!")
	        return

	    st.subheader("Comparison Report")
	    for page_num, report, img in reports:
	        with st.expander(f"Page {page_num} Analysis"):
	            image_col, text_col = st.columns([1, 2])
	            with image_col:
	                st.image(img, use_column_width=True)
	            with text_col:
	                st.markdown(f"Page {page_num} Report")
	                # Show just the analysis part
	                st.write(report.split("ANALYSIS:")[-1])

	# Script entry point: build the UI only when this file is run directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()