Spaces:

gnaw05
/

compare_docs

Sleeping

App Files Files Community

gnaw05 committed 4 days ago

Commit

d9acf37

1 Parent(s): 355bf63

init

Browse files

Files changed (3) hide show

app.py +134 -0
predict.py +24 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import streamlit as st
+from io import StringIO
+import PyPDF4
+import pdfplumber
+import docx2txt
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import difflib
+from predict import run_prediction
+# ========== CONFIG ==========
# Set the page title and use the wide layout.
st.set_page_config(
    page_title="📑 Contract Analyzer",
    layout="wide",
)
+# ========== FUNCTIONS ==========
def extract_text_from_pdf(uploaded_file):
    """Extract the text of a PDF, trying pdfplumber first and PyPDF4 second.

    Args:
        uploaded_file: The PDF to read, passed unchanged to
            ``pdfplumber.open`` and, on failure, to
            ``PyPDF4.PdfFileReader``. In this app it is the object returned
            by ``st.file_uploader``.

    Returns:
        The text of every page joined with newlines, or ``""`` when both
        readers fail (the second failure is reported via ``st.error``).
    """
    try:
        with pdfplumber.open(uploaded_file) as pdf:
            # `or ""` keeps the join working for pages that yield no text.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception:
        # pdfplumber could not handle the file; fall through to PyPDF4.
        # The failed attempt may have advanced the stream, so rewind it to
        # let the fallback reader start from the beginning of the file.
        if hasattr(uploaded_file, "seek"):
            uploaded_file.seek(0)

    try:
        reader = PyPDF4.PdfFileReader(uploaded_file)
        return "\n".join(
            reader.getPage(index).extractText()
            for index in range(reader.numPages)
        )
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
        return ""
def load_text(file):
    """Return the plain text of an uploaded ``.txt``, ``.pdf`` or ``.docx`` file.

    Args:
        file: Upload object exposing ``name`` and, for text files,
            ``getvalue()`` returning bytes. May be ``None`` or otherwise
            falsy when nothing was uploaded.

    Returns:
        The extracted text, or ``""`` when no file was given, the extension
        is unsupported (a warning is shown), or loading raised (the error is
        shown via ``st.error``).
    """
    if not file:
        return ""
    try:
        # Text after the last dot, lower-cased; a name without a dot yields
        # the whole name and ends up in the "unsupported" branch.
        ext = file.name.rsplit(".", 1)[-1].lower()
        if ext == "txt":
            # utf-8-sig decodes ordinary UTF-8 and additionally drops a
            # leading byte-order mark, which would otherwise appear as a
            # spurious difference between two documents.
            return file.getvalue().decode("utf-8-sig")
        if ext == "pdf":
            return extract_text_from_pdf(file)
        if ext == "docx":
            return docx2txt.process(file)
        st.warning(f"Unsupported file type: {ext}")
        return ""
    except Exception as e:
        # UI boundary: report the problem on the page instead of crashing.
        st.error(f"Error loading file: {e}")
        return ""
def highlight_diff(text1, text2):
    """Render a word-level diff of two texts as an HTML fragment.

    Both texts are split on whitespace and compared with ``difflib.Differ``.
    Words only in ``text1`` are wrapped in a red-background ``<span>``, words
    only in ``text2`` in a green-background ``<span>``, and common words are
    emitted as-is. Every word is followed by a single space.

    Args:
        text1: The original text.
        text2: The text to compare against ``text1``.

    Returns:
        An HTML string. Document words are HTML-escaped so that markup inside
        the documents is displayed literally rather than interpreted.
    """
    # Function-scope import keeps this block independent of the file header.
    from html import escape

    pieces = []
    for entry in difflib.Differ().compare(text1.split(), text2.split()):
        marker, word = entry[:2], escape(entry[2:])
        if marker == "? ":
            # Differ's intraline hint lines ("?  ^^") are not document words.
            continue
        if marker == "- ":
            pieces.append(f'<span style="background-color:#ffcccc">{word}</span> ')
        elif marker == "+ ":
            pieces.append(f'<span style="background-color:#ccffcc">{word}</span> ')
        else:
            pieces.append(word + " ")
    return "".join(pieces)
def compute_similarity(text1, text2):
    """Return how similar two texts are, as a percentage.

    The score is the cosine similarity of the texts' TF-IDF vectors. If that
    computation raises, the ``difflib.SequenceMatcher`` ratio of the raw
    strings is used instead.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A plain ``float`` percentage; ``0.0`` when either text is empty or
        whitespace-only.
    """
    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        # The token pattern also keeps single-character words.
        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        matrix = vectorizer.fit_transform([text1, text2])
        score = cosine_similarity(matrix[0:1], matrix[1:2])[0][0]
    except Exception:
        # Best-effort fallback (presumably for inputs the vectorizer rejects,
        # such as text without any word characters -- TODO confirm).
        score = difflib.SequenceMatcher(None, text1, text2).ratio()
    # float() turns a numpy scalar into a builtin float for callers.
    return float(score) * 100
+# ========== MAIN ==========
def _show_answer(column, label, question, text):
    """Ask the QA model one question about one document and show the result.

    Args:
        column: Streamlit column (from ``st.columns``) to render into.
        label: Human-readable document name, e.g. ``"Document 1"``.
        question: The user's question.
        text: Full text of the document to search for an answer.
    """
    with column:
        st.subheader(f"Answer from {label}")
        with st.spinner("Analyzing..."):
            try:
                prediction = run_prediction(
                    [question],
                    text,
                    model_name='marshmellow77/roberta-base-cuad',
                    n_best_size=5,
                )
                # Only one question is sent, so its answer is under key '0'.
                st.success(prediction.get('0', 'No answer found'))
            except Exception as e:
                # UI boundary: show the failure instead of crashing the page.
                st.error(f"Failed on {label}: {e}")


def main():
    """Render the Contract Analyzer page.

    Steps: upload two documents, show their extracted text, compare them on
    request (similarity score plus highlighted word diff), and answer a
    free-form question against each document.
    """
    st.title("📑 Contract Analyzer")
    st.markdown("Upload two contracts, compare them, and ask any question!")

    # 1. Upload documents
    st.header("1. Upload Documents")
    col1, col2 = st.columns(2)
    with col1:
        file1 = st.file_uploader("Upload First Document", type=["txt", "pdf", "docx"], key="file1")
    with col2:
        file2 = st.file_uploader("Upload Second Document", type=["txt", "pdf", "docx"], key="file2")

    # load_text returns "" for a missing upload, so no extra guard is needed.
    text1 = load_text(file1)
    text2 = load_text(file2)
    if not (file1 and file2):
        st.warning("Please upload both documents to continue.")
        return
    if not (text1 and text2):
        # Both files were provided, so asking for an upload again would be
        # misleading; the problem is that extraction produced no text.
        st.warning("No text could be extracted from one or both documents.")
        return

    # 2. Display uploaded texts
    st.header("2. Documents Content")
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("First Document")
        st.text_area("Content of first document:", text1, height=300)
    with col2:
        st.subheader("Second Document")
        st.text_area("Content of second document:", text2, height=300)

    # 3. Compare documents
    st.header("3. Compare Documents")
    if st.button("Compare Documents"):
        sim_score = compute_similarity(text1, text2)
        st.metric("Similarity Score", f"{sim_score:.2f}%")
        diff_html = highlight_diff(text1, text2)
        st.markdown("**Differences Highlighted:**")
        # diff_html is an HTML fragment, hence unsafe_allow_html here.
        st.markdown(f"<div style='border:1px solid #ccc; padding:10px; max-height:400px; overflow:auto'>{diff_html}</div>", unsafe_allow_html=True)

    # 4. Ask any question
    st.header("4. Ask a Question")
    user_question = st.text_input("Enter your question about the contracts:")
    # Short-circuit: the button is only rendered once a question was entered.
    if user_question and st.button("Analyze Question"):
        col1, col2 = st.columns(2)
        _show_answer(col1, "Document 1", user_question, text1)
        _show_answer(col2, "Document 2", user_question, text2)


# Run the app when this file is executed as a script.
if __name__ == "__main__":
    main()

predict.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from transformers import pipeline
# Load the model once up front so it is not loaded again on every call.
qa_pipeline = pipeline(
    task="question-answering",
    tokenizer="marshmellow77/roberta-base-cuad",
    model="marshmellow77/roberta-base-cuad",
)
def run_prediction(questions, context, model_name=None, n_best_size=5):
    """Answer each question against ``context`` with the module-level pipeline.

    Args:
        questions: List of questions, e.g. ``['What is the payment term?']``.
        context: The text (contract) in which to look for the answers.
        model_name: Not used; kept so existing callers stay compatible.
        n_best_size: Not used; kept so existing callers stay compatible.

    Returns:
        A dict mapping each question's index (as a string, ``'0'``, ``'1'``,
        ...) to the ``'answer'`` field of the pipeline result.
    """
    return {
        str(position): qa_pipeline({
            'context': context,
            'question': query
        })['answer']
        for position, query in enumerate(questions)
    }

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+streamlit
+scikit-learn
+pdfplumber
+PyPDF4
+docx2txt
+transformers
+torch