Spaces:

Abhilashvj
/

compare-docs

Sleeping

App Files Files Community

compare-docs / app.py

Abhilashvj

Update app.py

06de1fa almost 2 years ago

raw

history blame contribute delete

2.5 kB

	import streamlit as st
	from docx import Document
	import PyPDF2
	import pdfplumber
	import pytesseract
	import difflib
	import base64

	def read_pdf(file):
	    """Extract the text of a PDF as one newline-joined string.

	    PyPDF2 is tried first. If it raises for any reason, a Streamlit
	    warning is shown and the same input is re-read with pdfplumber.

	    Args:
	        file: Whatever ``PyPDF2.PdfReader`` and ``pdfplumber.open`` accept;
	            the visible caller passes the object returned by
	            ``st.file_uploader``.

	    Returns:
	        The text of all pages joined with ``"\\n"``, or ``None`` when both
	        parsers fail (a Streamlit error is shown in that case).

	    Note:
	        A file-like input is left at whatever position the parser stopped;
	        callers that need the raw bytes afterwards must rewind it
	        themselves.
	    """
	    try:
	        # PdfReader/.pages replace PdfFileReader/numPages/getPage, which
	        # PyPDF2 3.x removed (they raise a deprecation error there).
	        reader = PyPDF2.PdfReader(file)
	        # extract_text() may yield None for a page without a text layer;
	        # substitute "" so the join cannot raise TypeError.
	        return "\n".join(page.extract_text() or "" for page in reader.pages)
	    except Exception:
	        # NOTE(review): the message says "OCR", but the fallback below is
	        # pdfplumber text extraction; wording kept as-is for the UI.
	        st.warning('Failed to directly read PDF, trying OCR...')

	    try:
	        # PyPDF2 may have consumed part of the stream before failing.
	        if hasattr(file, "seek"):
	            file.seek(0)
	        with pdfplumber.open(file) as pdf:
	            return "\n".join(page.extract_text() or "" for page in pdf.pages)
	    except Exception as e:
	        st.error(f"Error in OCR: {str(e)}")
	        return None

	def read_docx(file):
	    """Return the text of every paragraph in a DOCX, joined with newlines.

	    Args:
	        file: Passed straight to ``docx.Document``.

	    Returns:
	        A single string with one line per paragraph of the document.
	    """
	    document = Document(file)
	    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
	    return "\n".join(paragraph_texts)

	def compare_texts(text1, text2):
	    """Summarise, per page, whether ``text2`` gained or lost lines vs ``text1``.

	    Both texts are split into lines and diffed with ``difflib.Differ``.
	    The page counter starts at 1 and is incremented for every diff line
	    that contains the substring ``'Page'``; such lines are never reported
	    as differences themselves.
	    NOTE(review): this presumably relies on the documents carrying
	    "Page ..." markers -- confirm against real inputs.

	    Args:
	        text1: Reference text.
	        text2: Text compared against the reference.

	    Returns:
	        Newline-joined, de-duplicated messages in order of first
	        occurrence; an empty string when no differing line is found.
	    """
	    differ = difflib.Differ()

	    # A dict is used as an insertion-ordered set: the original set() made
	    # the message order change between runs (string hash randomisation).
	    findings = {}
	    page_no = 1
	    for line in differ.compare(text1.splitlines(), text2.splitlines()):
	        if 'Page' in line:  # treated as the start of a new page
	            page_no += 1
	        elif line.startswith('+ '):  # present in text2 but not in text1
	            findings[f'Additional text detected on page {page_no}'] = None
	        elif line.startswith('- '):  # present in text1 but not in text2
	            findings[f'Less text detected on page {page_no}'] = None

	    return "\n".join(findings)

	# Streamlit page: upload one PDF and one DOCX, preview both, then list the
	# per-page differences reported by compare_texts().
	st.title('PDF and DOCX Comparison Tool')

	pdf_file = st.file_uploader('Upload a PDF file', type=['pdf'])
	docx_file = st.file_uploader('Upload a DOCX file', type=['docx'])

	if pdf_file and docx_file:
	    # Snapshot the raw bytes before parsing: read_pdf() advances the
	    # upload's stream, so a later .read() would return b"" and leave the
	    # download link and preview empty.
	    pdf_bytes = pdf_file.getvalue()

	    pdf_text = read_pdf(pdf_file)
	    docx_text = read_docx(docx_file)

	    b64_pdf = base64.b64encode(pdf_bytes).decode()  # bytes -> base64 str for the data: URIs
	    # "<some_name>" is escaped because this string is rendered as raw
	    # HTML, where it would otherwise be parsed as an unknown tag and vanish.
	    href = f'<a href="data:application/pdf;base64,{b64_pdf}" download="file.pdf">Download PDF File</a> (right-click and save as &lt;some_name&gt;.pdf)'
	    st.markdown(href, unsafe_allow_html=True)

	    st.markdown(f'<iframe src="data:application/pdf;base64,{b64_pdf}" width="50%" height="600px" type="application/pdf"></iframe>', unsafe_allow_html=True)
	    st.markdown("### DOCX Content:")
	    st.text(docx_text)

	    # read_pdf() returns None on failure and either text may be empty;
	    # only compare when both extractions produced something.
	    if pdf_text and docx_text:
	        comparison_result = compare_texts(pdf_text, docx_text)
	        st.text(comparison_result)
	    else:
	        st.error('Failed to read text from one or both files.')