compare-docs / app.py
Abhilashvj's picture
Update app.py
06de1fa
raw
history blame contribute delete
2.5 kB
import streamlit as st
from docx import Document
import PyPDF2
import pdfplumber
import pytesseract
import difflib
import base64
def read_pdf(file):
    """Extract the text of a PDF, trying PyPDF2 first and pdfplumber second.

    Args:
        file: A path or a binary file-like object holding the PDF.

    Returns:
        The text of all pages joined with newlines, or ``None`` when both
        extractors fail (an error is shown in the Streamlit UI in that case).
    """
    try:
        # PdfReader/.pages replace PdfFileReader/numPages/getPage, which were
        # removed in PyPDF2 3.0 and made this branch fail unconditionally.
        pdf_reader = PyPDF2.PdfReader(file)
        # A page may yield no text; treat that as an empty page rather than
        # letting a None break the join.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception:
        # NOTE: the message says "OCR", but this block never calls pytesseract;
        # the fallback is a second extraction attempt with pdfplumber.
        st.warning('Failed to directly read PDF, trying OCR...')
        try:
            # The failed attempt may have advanced the stream; start over.
            if hasattr(file, "seek"):
                file.seek(0)
            with pdfplumber.open(file) as pdf:
                return "\n".join(page.extract_text() or "" for page in pdf.pages)
        except Exception as e:
            st.error(f"Error in OCR: {str(e)}")
            return None
def read_docx(file):
    """Return the text of every paragraph in a DOCX file, one per line.

    Args:
        file: Whatever ``docx.Document`` accepts (a path or file-like object).

    Returns:
        The paragraph texts joined with newline characters.
    """
    paragraphs = Document(file).paragraphs
    lines = []
    for paragraph in paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)
def compare_texts(text1, text2):
    """Summarise line-level differences between two texts, per page.

    The texts are diffed line by line with ``difflib.Differ``. Every diff line
    containing the substring ``'Page'`` (changed or not, from either text)
    is treated as a page boundary: it advances the page counter and is not
    itself reported. Other added/removed lines produce one message per
    (kind, page) combination.

    Args:
        text1: Reference text.
        text2: Text compared against ``text1``.

    Returns:
        Newline-joined, de-duplicated messages in the order they were first
        encountered; an empty string when nothing was reported.
    """
    diff = difflib.Differ().compare(text1.splitlines(), text2.splitlines())
    # A dict serves as an insertion-ordered set. The previous ``set`` made the
    # order of the returned messages vary from run to run.
    findings = {}
    page_no = 1
    for line in diff:
        if 'Page' in line:  # heuristic page marker
            page_no += 1
        elif line.startswith('+ '):  # present in text2 but not in text1
            findings[f'Additional text detected on page {page_no}'] = None
        elif line.startswith('- '):  # present in text1 but not in text2
            findings[f'Less text detected on page {page_no}'] = None
    return "\n".join(findings)
# Streamlit page: upload one PDF and one DOCX, preview both, and list the
# per-page differences between their extracted texts.
st.title('PDF and DOCX Comparison Tool')

pdf_file = st.file_uploader('Upload a PDF file', type=['pdf'])
docx_file = st.file_uploader('Upload a DOCX file', type=['docx'])

if pdf_file and docx_file:
    pdf_text = read_pdf(pdf_file)
    docx_text = read_docx(docx_file)

    # read_pdf() has already consumed the upload stream, so read() here would
    # return b"" and produce an empty download link and preview. getvalue()
    # returns the whole buffer regardless of the current stream position.
    b64_pdf = base64.b64encode(pdf_file.getvalue()).decode()

    # Download link for the uploaded PDF, embedded as a data URI.
    href = f'<a href="data:file/pdf;base64,{b64_pdf}" download="file.pdf">Download PDF File</a> (right-click and save as &lt;some_name&gt;.pdf)'
    st.markdown(href, unsafe_allow_html=True)

    # Inline preview of the PDF.
    st.markdown(f'<iframe src="data:application/pdf;base64,{b64_pdf}" width="50%" height="600px" type="application/pdf"></iframe>', unsafe_allow_html=True)

    st.markdown("### DOCX Content:")
    st.text(docx_text)

    # Compare only when both extractions produced non-empty text.
    if pdf_text and docx_text:
        comparison_result = compare_texts(pdf_text, docx_text)
        st.text(comparison_result)
    else:
        st.error('Failed to read text from one or both files.')