Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -18,20 +18,36 @@ st.set_page_config(
|
|
18 |
# ========== CACHED DATA LOADING ==========
|
19 |
@st.cache_data(show_spinner=False)
|
20 |
def load_questions():
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
23 |
|
24 |
@st.cache_data(show_spinner=False)
|
25 |
def load_questions_short():
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
|
29 |
# ========== UTILITY FUNCTIONS ==========
|
30 |
def extract_text_from_pdf(uploaded_file):
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
def highlight_differences(text1, text2):
|
|
|
|
|
|
|
35 |
differ = difflib.Differ()
|
36 |
diff = list(differ.compare(text1.split(), text2.split()))
|
37 |
|
@@ -48,32 +64,50 @@ def highlight_differences(text1, text2):
|
|
48 |
return highlighted_text
|
49 |
|
50 |
def calculate_similarity(text1, text2):
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
def load_contract(file):
|
|
|
|
|
|
|
57 |
ext = file.name.split('.')[-1].lower()
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
# ========== MAIN APP ==========
|
73 |
def main():
|
74 |
questions = load_questions()
|
75 |
questions_short = load_questions_short()
|
76 |
|
|
|
|
|
|
|
|
|
77 |
st.title("📑 Contract Analysis Suite")
|
78 |
st.markdown("""
|
79 |
Compare documents and analyze legal clauses using AI-powered question answering.
|
@@ -90,8 +124,8 @@ def main():
|
|
90 |
key="file1"
|
91 |
)
|
92 |
contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
|
93 |
-
st.
|
94 |
-
|
95 |
with col2:
|
96 |
uploaded_file2 = st.file_uploader(
|
97 |
"Upload Second Document",
|
@@ -99,7 +133,19 @@ def main():
|
|
99 |
key="file2"
|
100 |
)
|
101 |
contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
|
102 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
if not (uploaded_file1 and uploaded_file2):
|
105 |
st.warning("Please upload both documents to proceed")
|
@@ -110,6 +156,10 @@ def main():
|
|
110 |
with st.expander("Show Document Differences", expanded=True):
|
111 |
if st.button("Compare Documents"):
|
112 |
with st.spinner("Analyzing documents..."):
|
|
|
|
|
|
|
|
|
113 |
similarity_score = calculate_similarity(contract_text1, contract_text2)
|
114 |
st.metric("Document Similarity Score", f"{similarity_score:.2f}%")
|
115 |
|
@@ -125,18 +175,23 @@ def main():
|
|
125 |
|
126 |
# ===== QUESTION ANALYSIS SECTION =====
|
127 |
st.header("3. Clause Analysis")
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
if st.button("Analyze Both Documents"):
|
138 |
-
if not (contract_text1 and contract_text2):
|
139 |
-
st.error("Please ensure both documents have content")
|
140 |
return
|
141 |
|
142 |
col1, col2 = st.columns(2)
|
@@ -144,16 +199,22 @@ def main():
|
|
144 |
with col1:
|
145 |
st.subheader("First Document Analysis")
|
146 |
with st.spinner('Processing first document...'):
|
147 |
-
|
148 |
-
|
149 |
-
|
|
|
|
|
|
|
150 |
|
151 |
with col2:
|
152 |
st.subheader("Second Document Analysis")
|
153 |
with st.spinner('Processing second document...'):
|
154 |
-
|
155 |
-
|
156 |
-
|
|
|
|
|
|
|
157 |
|
158 |
if __name__ == "__main__":
|
159 |
main()
|
|
|
18 |
# ========== CACHED DATA LOADING ==========
|
19 |
@st.cache_data(show_spinner=False)
def load_questions():
    """Load the full-length legal questions from ``data/questions.txt``.

    Each non-blank line of the file is one question; surrounding whitespace
    is removed and blank lines are skipped.

    Returns:
        A list of question strings, or an empty list when the file cannot be
        read (an error message is shown in the Streamlit UI in that case).
    """
    # NOTE(review): st.cache_data caches the return value, so the empty-list
    # fallback is presumably cached too until the cache is cleared - confirm.
    try:
        # Explicit encoding: the default otherwise depends on the host locale.
        with open('data/questions.txt', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            return [q for q in stripped if q]
    except Exception as e:
        # UI boundary: report the problem and let the caller handle "no questions".
        st.error(f"Error loading questions: {str(e)}")
        return []
|
27 |
|
28 |
@st.cache_data(show_spinner=False)
def load_questions_short():
    """Load the short question labels from ``data/questions_short.txt``.

    Each non-blank line of the file is one label; surrounding whitespace is
    removed and blank lines are skipped.

    Returns:
        A list of label strings, or an empty list when the file cannot be
        read (an error message is shown in the Streamlit UI in that case).
    """
    # NOTE(review): st.cache_data caches the return value, so the empty-list
    # fallback is presumably cached too until the cache is cleared - confirm.
    try:
        # Explicit encoding: the default otherwise depends on the host locale.
        with open('data/questions_short.txt', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            return [q for q in stripped if q]
    except Exception as e:
        # UI boundary: report the problem and let the caller handle "no questions".
        st.error(f"Error loading short questions: {str(e)}")
        return []
|
36 |
|
37 |
# ========== UTILITY FUNCTIONS ==========
|
38 |
def extract_text_from_pdf(uploaded_file):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        uploaded_file: Object accepted by ``pdfplumber.open``.

    Returns:
        The page texts joined with newlines, or ``""`` when the result is
        only whitespace or extraction raised (the error is shown in the UI).
    """
    try:
        with pdfplumber.open(uploaded_file) as pdf:
            page_texts = []
            for page in pdf.pages:
                # A page yielding no text contributes an empty string.
                page_texts.append(page.extract_text() or "")
            joined = "\n".join(page_texts)
            if not joined.strip():
                return ""
            return joined
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""
|
46 |
|
47 |
def highlight_differences(text1, text2):
|
48 |
+
if not text1 or not text2:
|
49 |
+
return ""
|
50 |
+
|
51 |
differ = difflib.Differ()
|
52 |
diff = list(differ.compare(text1.split(), text2.split()))
|
53 |
|
|
|
64 |
return highlighted_text
|
65 |
|
66 |
def calculate_similarity(text1, text2):
    """Return a similarity percentage (0-100) for two texts.

    The score is the cosine similarity of the two texts' TF-IDF vectors,
    scaled to a percentage. If vectorization raises ``ValueError``, the
    function falls back to ``difflib.SequenceMatcher`` on the raw strings.

    Args:
        text1: First document text (``None`` is treated as empty).
        text2: Second document text (``None`` is treated as empty).

    Returns:
        A plain ``float`` between 0.0 and 100.0; 0.0 when either text is
        missing, empty, or whitespace-only.
    """
    # Guard falsy values first so None does not raise AttributeError on .strip().
    if not text1 or not text2 or not text1.strip() or not text2.strip():
        return 0.0

    try:
        vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
        # Convert the array scalar to a built-in float for callers.
        score = float(similarity[0][0]) * 100
    except ValueError:
        # Presumably raised when the vectorizer finds no usable tokens;
        # fall back to a character-level ratio.
        score = difflib.SequenceMatcher(None, text1, text2).ratio() * 100

    # Floating-point rounding can land marginally outside the nominal range.
    return min(100.0, max(0.0, score))
|
77 |
|
78 |
def load_contract(file):
    """Read an uploaded contract and return its text content.

    Supported extensions (case-insensitive, taken from ``file.name``):
    ``txt`` (decoded as UTF-8), ``pdf`` (pdfplumber via
    ``extract_text_from_pdf``, then PyPDF4 if that yields nothing) and
    ``docx`` (docx2txt).

    Args:
        file: Uploaded file object exposing ``name`` and, for text files,
            ``getvalue()``; ``None`` is accepted.

    Returns:
        The stripped text, or ``""`` when ``file`` is ``None``, the type is
        unsupported, nothing could be extracted, or loading raised (a
        warning/error is shown in the Streamlit UI in those cases).
    """
    if file is None:
        return ""

    ext = file.name.split('.')[-1].lower()
    try:
        if ext == 'txt':
            # utf-8-sig drops a leading byte-order mark, which .strip() would
            # otherwise leave at the start of the text.
            content = file.getvalue().decode("utf-8-sig")
        elif ext == 'pdf':
            content = extract_text_from_pdf(file)
            if not content:
                # Fallback to PyPDF4; rewind first in case the previous
                # parser left the stream position somewhere else.
                file.seek(0)
                pdf_reader = PyPDF4.PdfFileReader(file)
                content = '\n'.join(
                    pdf_reader.getPage(i).extractText()
                    for i in range(pdf_reader.numPages)
                )
        elif ext == 'docx':
            content = docx2txt.process(file)
        else:
            st.warning('Unsupported file type')
            return ""
        return content.strip() if content else ""
    except Exception as e:
        # UI boundary: surface the failure and return an empty document.
        st.error(f"Error loading {ext.upper()} file: {str(e)}")
        return ""
|
101 |
|
102 |
# ========== MAIN APP ==========
|
103 |
def main():
|
104 |
questions = load_questions()
|
105 |
questions_short = load_questions_short()
|
106 |
|
107 |
+
if not questions or not questions_short or len(questions) != len(questions_short):
|
108 |
+
st.error("Failed to load questions or questions mismatch. Please check data files.")
|
109 |
+
return
|
110 |
+
|
111 |
st.title("📑 Contract Analysis Suite")
|
112 |
st.markdown("""
|
113 |
Compare documents and analyze legal clauses using AI-powered question answering.
|
|
|
124 |
key="file1"
|
125 |
)
|
126 |
contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
|
127 |
+
doc1_display = st.empty()
|
128 |
+
|
129 |
with col2:
|
130 |
uploaded_file2 = st.file_uploader(
|
131 |
"Upload Second Document",
|
|
|
133 |
key="file2"
|
134 |
)
|
135 |
contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
|
136 |
+
doc2_display = st.empty()
|
137 |
+
|
138 |
+
# Update document displays
|
139 |
+
if uploaded_file1:
|
140 |
+
doc1_display.text_area("Document 1 Content",
|
141 |
+
value=contract_text1,
|
142 |
+
height=200,
|
143 |
+
key="area1")
|
144 |
+
if uploaded_file2:
|
145 |
+
doc2_display.text_area("Document 2 Content",
|
146 |
+
value=contract_text2,
|
147 |
+
height=200,
|
148 |
+
key="area2")
|
149 |
|
150 |
if not (uploaded_file1 and uploaded_file2):
|
151 |
st.warning("Please upload both documents to proceed")
|
|
|
156 |
with st.expander("Show Document Differences", expanded=True):
|
157 |
if st.button("Compare Documents"):
|
158 |
with st.spinner("Analyzing documents..."):
|
159 |
+
if not contract_text1.strip() or not contract_text2.strip():
|
160 |
+
st.error("One or both documents appear to be empty or couldn't be read properly")
|
161 |
+
return
|
162 |
+
|
163 |
similarity_score = calculate_similarity(contract_text1, contract_text2)
|
164 |
st.metric("Document Similarity Score", f"{similarity_score:.2f}%")
|
165 |
|
|
|
175 |
|
176 |
# ===== QUESTION ANALYSIS SECTION =====
|
177 |
st.header("3. Clause Analysis")
|
178 |
+
|
179 |
+
try:
|
180 |
+
question_selected = st.selectbox(
|
181 |
+
'Select a legal question to analyze:',
|
182 |
+
questions_short,
|
183 |
+
index=0,
|
184 |
+
key="question_select"
|
185 |
+
)
|
186 |
+
question_idx = questions_short.index(question_selected)
|
187 |
+
selected_question = questions[question_idx]
|
188 |
+
except Exception as e:
|
189 |
+
st.error(f"Error selecting question: {str(e)}")
|
190 |
+
return
|
191 |
|
192 |
if st.button("Analyze Both Documents"):
|
193 |
+
if not (contract_text1.strip() and contract_text2.strip()):
|
194 |
+
st.error("Please ensure both documents have readable content")
|
195 |
return
|
196 |
|
197 |
col1, col2 = st.columns(2)
|
|
|
199 |
with col1:
|
200 |
st.subheader("First Document Analysis")
|
201 |
with st.spinner('Processing first document...'):
|
202 |
+
try:
|
203 |
+
predictions1 = run_prediction([selected_question], contract_text1, 'marshmellow77/roberta-base-cuad', n_best_size=5)
|
204 |
+
answer1 = predictions1.get('0', 'No answer found')
|
205 |
+
st.success(answer1 if answer1 else "No relevant clause found")
|
206 |
+
except Exception as e:
|
207 |
+
st.error(f"Analysis failed for Document 1: {str(e)}")
|
208 |
|
209 |
with col2:
|
210 |
st.subheader("Second Document Analysis")
|
211 |
with st.spinner('Processing second document...'):
|
212 |
+
try:
|
213 |
+
predictions2 = run_prediction([selected_question], contract_text2, 'marshmellow77/roberta-base-cuad', n_best_size=5)
|
214 |
+
answer2 = predictions2.get('0', 'No answer found')
|
215 |
+
st.success(answer2 if answer2 else "No relevant clause found")
|
216 |
+
except Exception as e:
|
217 |
+
st.error(f"Analysis failed for Document 2: {str(e)}")
|
218 |
|
219 |
if __name__ == "__main__":
|
220 |
main()
|