File size: 4,922 Bytes
d9acf37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import difflib
import html
from io import StringIO

import docx2txt
import pdfplumber
import PyPDF4
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from predict import run_prediction

# ========== CONFIG ==========
# "\U0001F4D1" is the BOOKMARK TABS emoji. It is written as an escape so the
# source line stays pure ASCII and cannot be mis-decoded into mojibake.
st.set_page_config(page_title="\U0001F4D1 Contract Analyzer", layout="wide")

# ========== FUNCTIONS ==========
def extract_text_from_pdf(uploaded_file):
    """Extract the text of a PDF, trying pdfplumber first and PyPDF4 second.

    Args:
        uploaded_file: The PDF to read, passed straight to
            ``pdfplumber.open`` and, on failure, to ``PyPDF4.PdfFileReader``.
            In this file it is the object returned by ``st.file_uploader``.

    Returns:
        The text of all pages joined with newlines. If both parsers fail,
        the error is shown with ``st.error`` and "" is returned.
    """
    try:
        with pdfplumber.open(uploaded_file) as pdf:
            # `or ""` keeps the join safe when a page yields no text.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception:
        # Deliberate best-effort fallback to a second parser. Only
        # ``Exception`` is caught so KeyboardInterrupt/SystemExit propagate.
        try:
            # The failed pdfplumber attempt may have advanced the stream;
            # rewind so PyPDF4 parses the document from its first byte.
            if hasattr(uploaded_file, "seek"):
                uploaded_file.seek(0)
            reader = PyPDF4.PdfFileReader(uploaded_file)
            return "\n".join(
                reader.getPage(i).extractText() or ""
                for i in range(reader.numPages)
            )
        except Exception as e:
            st.error(f"Error reading PDF: {e}")
            return ""

def load_text(file):
    """Return the textual content of an uploaded txt, pdf or docx file.

    Args:
        file: Uploaded file object exposing ``name`` and, for txt files,
            ``getvalue()``. A falsy value yields "".

    Returns:
        The extracted text. "" is returned for a missing file, for an
        unsupported extension (after ``st.warning``) and for any error
        raised while loading (after ``st.error``).
    """
    if not file:
        return ""
    try:
        # The extension is whatever follows the last dot, case-insensitive.
        suffix = file.name.rsplit('.', 1)[-1].lower()
        if suffix == 'txt':
            raw_bytes = file.getvalue()
            return raw_bytes.decode("utf-8")
        if suffix == 'pdf':
            return extract_text_from_pdf(file)
        if suffix == 'docx':
            return docx2txt.process(file)
        st.warning(f"Unsupported file type: {suffix}")
        return ""
    except Exception as e:
        st.error(f"Error loading file: {e}")
        return ""

def highlight_diff(text1, text2):
    """Build an HTML fragment highlighting word-level differences.

    Both texts are split on whitespace and compared with ``difflib.Differ``.
    Words only in ``text1`` get a red background, words only in ``text2`` a
    green background, and common words are emitted unchanged.

    Args:
        text1: The original text.
        text2: The text to compare against ``text1``.

    Returns:
        An HTML string in which every emitted word is followed by a single
        space; "" when both texts contain no words.
    """
    fragments = []
    for entry in difflib.Differ().compare(text1.split(), text2.split()):
        tag = entry[:2]
        if tag == "? ":
            # Differ adds "? " guide lines next to similar items to point at
            # intraline changes; they are not words from either document.
            continue
        # The words come from uploaded documents and the result is rendered
        # as raw HTML, so markup characters must be neutralised.
        word = html.escape(entry[2:])
        if tag == "- ":
            fragments.append(
                f'<span style="background-color:#ffcccc">{word}</span> '
            )
        elif tag == "+ ":
            fragments.append(
                f'<span style="background-color:#ccffcc">{word}</span> '
            )
        else:
            fragments.append(word + " ")
    return "".join(fragments)

def compute_similarity(text1, text2):
    """Return the similarity of two texts as a percentage.

    The score is the cosine similarity of the TF-IDF vectors of the two
    texts, multiplied by 100. If that computation raises, the function falls
    back to ``difflib.SequenceMatcher(...).ratio() * 100``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A ``float`` percentage; 0.0 when either text is empty or
        whitespace-only.
    """
    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        # The pattern matches every run of word characters, so
        # single-character tokens are kept as features.
        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
        sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
        # Convert to a plain float so both code paths return the same type.
        return float(sim[0][0]) * 100
    except Exception:
        # Best-effort fallback. Only ``Exception`` is caught so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        return difflib.SequenceMatcher(None, text1, text2).ratio() * 100

# ========== MAIN ==========
def main():
    """Render the Contract Analyzer page.

    Sections, in order:
      1. Two uploaders (txt/pdf/docx). The function returns early with a
         warning until both uploads yield non-empty text.
      2. Side-by-side text areas showing the extracted text.
      3. A "Compare Documents" button that shows the similarity score and a
         highlighted word-level diff.
      4. A free-text question answered against each document with
         ``run_prediction``.
    """
    # "\U0001F4D1" is the BOOKMARK TABS emoji, written as an escape so the
    # title cannot turn into mojibake if the file is mis-decoded.
    st.title("\U0001F4D1 Contract Analyzer")
    st.markdown("Upload two contracts, compare them, and ask any question!")

    # Upload documents
    st.header("1. Upload Documents")
    upload_types = ["txt", "pdf", "docx"]
    col1, col2 = st.columns(2)
    with col1:
        file1 = st.file_uploader("Upload First Document", type=upload_types, key="file1")
    with col2:
        file2 = st.file_uploader("Upload Second Document", type=upload_types, key="file2")

    text1 = load_text(file1) if file1 else ""
    text2 = load_text(file2) if file2 else ""

    if not (text1 and text2):
        st.warning("Please upload both documents to continue.")
        return

    # Display uploaded texts
    st.header("2. Documents Content")
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("First Document")
        st.text_area("Content of first document:", text1, height=300)
    with col2:
        st.subheader("Second Document")
        st.text_area("Content of second document:", text2, height=300)

    # Compare documents
    st.header("3. Compare Documents")
    if st.button("Compare Documents"):
        sim_score = compute_similarity(text1, text2)
        st.metric("Similarity Score", f"{sim_score:.2f}%")
        diff_html = highlight_diff(text1, text2)
        # Plain Markdown: no raw-HTML rendering is needed for this label.
        st.markdown("**Differences Highlighted:**")
        st.markdown(f"<div style='border:1px solid #ccc; padding:10px; max-height:400px; overflow:auto'>{diff_html}</div>", unsafe_allow_html=True)

    # Ask any question
    st.header("4. Ask a Question")
    user_question = st.text_input("Enter your question about the contracts:")

    if user_question and st.button("Analyze Question"):
        model_name = 'marshmellow77/roberta-base-cuad'
        documents = ((1, text1), (2, text2))
        # One column per document; the same question is run against each.
        for column, (doc_number, doc_text) in zip(st.columns(2), documents):
            with column:
                st.subheader(f"Answer from Document {doc_number}")
                with st.spinner("Analyzing..."):
                    try:
                        prediction = run_prediction(
                            [user_question],
                            doc_text,
                            model_name=model_name,
                            n_best_size=5,
                        )
                        # NOTE(review): presumably the result is a dict keyed
                        # by the question index as a string - confirm against
                        # predict.run_prediction. `or` also covers an empty
                        # answer, which would otherwise show a blank box.
                        st.success(prediction.get('0') or 'No answer found')
                    except Exception as e:
                        st.error(f"Failed on Document {doc_number}: {e}")

# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()