ludigija committed on
Commit
3260637
·
verified ·
1 Parent(s): 040feca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -42
app.py CHANGED
@@ -18,20 +18,36 @@ st.set_page_config(
18
  # ========== CACHED DATA LOADING ==========
19
  @st.cache_data(show_spinner=False)
20
  def load_questions():
21
- with open('data/questions.txt') as f:
22
- return f.readlines()
 
 
 
 
23
 
24
  @st.cache_data(show_spinner=False)
25
  def load_questions_short():
26
- with open('data/questions_short.txt') as f:
27
- return f.readlines()
 
 
 
 
28
 
29
  # ========== UTILITY FUNCTIONS ==========
30
  def extract_text_from_pdf(uploaded_file):
31
- with pdfplumber.open(uploaded_file) as pdf:
32
- return "\n".join(page.extract_text() or "" for page in pdf.pages)
 
 
 
 
 
33
 
34
  def highlight_differences(text1, text2):
 
 
 
35
  differ = difflib.Differ()
36
  diff = list(differ.compare(text1.split(), text2.split()))
37
 
@@ -48,32 +64,50 @@ def highlight_differences(text1, text2):
48
  return highlighted_text
49
 
50
  def calculate_similarity(text1, text2):
51
- vectorizer = TfidfVectorizer()
52
- tfidf_matrix = vectorizer.fit_transform([text1, text2])
53
- similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
54
- return similarity[0][0] * 100
 
 
 
 
 
 
55
 
56
  def load_contract(file):
 
 
 
57
  ext = file.name.split('.')[-1].lower()
58
- if ext == 'txt':
59
- return StringIO(file.getvalue().decode("utf-8")).read()
60
- elif ext == 'pdf':
61
- try:
62
- pdfReader = PyPDF4.PdfFileReader(file)
63
- return '\n'.join([pdfReader.getPage(i).extractText() for i in range(pdfReader.numPages)])
64
- except:
65
- st.warning('Unable to read PDF, please try another file')
66
- elif ext == 'docx':
67
- return docx2txt.process(file)
68
- else:
69
- st.warning('Unsupported file type')
70
- return ""
 
 
 
 
 
71
 
72
  # ========== MAIN APP ==========
73
  def main():
74
  questions = load_questions()
75
  questions_short = load_questions_short()
76
 
 
 
 
 
77
  st.title("📑 Contract Analysis Suite")
78
  st.markdown("""
79
  Compare documents and analyze legal clauses using AI-powered question answering.
@@ -90,8 +124,8 @@ def main():
90
  key="file1"
91
  )
92
  contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
93
- st.text_area("Document 1 Content", value=contract_text1, height=200, key="area1")
94
-
95
  with col2:
96
  uploaded_file2 = st.file_uploader(
97
  "Upload Second Document",
@@ -99,7 +133,19 @@ def main():
99
  key="file2"
100
  )
101
  contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
102
- st.text_area("Document 2 Content", value=contract_text2, height=200, key="area2")
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  if not (uploaded_file1 and uploaded_file2):
105
  st.warning("Please upload both documents to proceed")
@@ -110,6 +156,10 @@ def main():
110
  with st.expander("Show Document Differences", expanded=True):
111
  if st.button("Compare Documents"):
112
  with st.spinner("Analyzing documents..."):
 
 
 
 
113
  similarity_score = calculate_similarity(contract_text1, contract_text2)
114
  st.metric("Document Similarity Score", f"{similarity_score:.2f}%")
115
 
@@ -125,18 +175,23 @@ def main():
125
 
126
  # ===== QUESTION ANALYSIS SECTION =====
127
  st.header("3. Clause Analysis")
128
- question_selected = st.selectbox(
129
- 'Select a legal question to analyze:',
130
- questions_short,
131
- index=0,
132
- key="question_select"
133
- )
134
- question_idx = questions_short.index(question_selected)
135
- selected_question = questions[question_idx]
 
 
 
 
 
136
 
137
  if st.button("Analyze Both Documents"):
138
- if not (contract_text1 and contract_text2):
139
- st.error("Please ensure both documents have content")
140
  return
141
 
142
  col1, col2 = st.columns(2)
@@ -144,16 +199,22 @@ def main():
144
  with col1:
145
  st.subheader("First Document Analysis")
146
  with st.spinner('Processing first document...'):
147
- predictions1 = run_prediction([selected_question], contract_text1, 'marshmellow77/roberta-base-cuad', n_best_size=5)
148
- answer1 = predictions1.get('0', 'No answer found')
149
- st.success(answer1)
 
 
 
150
 
151
  with col2:
152
  st.subheader("Second Document Analysis")
153
  with st.spinner('Processing second document...'):
154
- predictions2 = run_prediction([selected_question], contract_text2, 'marshmellow77/roberta-base-cuad', n_best_size=5)
155
- answer2 = predictions2.get('0', 'No answer found')
156
- st.success(answer2)
 
 
 
157
 
158
  if __name__ == "__main__":
159
  main()
 
18
  # ========== CACHED DATA LOADING ==========
19
  @st.cache_data(show_spinner=False)
20
  def load_questions():
21
+ try:
22
+ with open('data/questions.txt') as f:
23
+ return [q.strip() for q in f.readlines() if q.strip()]
24
+ except Exception as e:
25
+ st.error(f"Error loading questions: {str(e)}")
26
+ return []
27
 
28
  @st.cache_data(show_spinner=False)
29
  def load_questions_short():
30
+ try:
31
+ with open('data/questions_short.txt') as f:
32
+ return [q.strip() for q in f.readlines() if q.strip()]
33
+ except Exception as e:
34
+ st.error(f"Error loading short questions: {str(e)}")
35
+ return []
36
 
37
  # ========== UTILITY FUNCTIONS ==========
38
  def extract_text_from_pdf(uploaded_file):
39
+ try:
40
+ with pdfplumber.open(uploaded_file) as pdf:
41
+ text = "\n".join(page.extract_text() or "" for page in pdf.pages)
42
+ return text if text.strip() else ""
43
+ except Exception as e:
44
+ st.error(f"PDF extraction error: {str(e)}")
45
+ return ""
46
 
47
  def highlight_differences(text1, text2):
48
+ if not text1 or not text2:
49
+ return ""
50
+
51
  differ = difflib.Differ()
52
  diff = list(differ.compare(text1.split(), text2.split()))
53
 
 
64
  return highlighted_text
65
 
66
  def calculate_similarity(text1, text2):
67
+ if not text1.strip() or not text2.strip():
68
+ return 0.0
69
+
70
+ try:
71
+ vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
72
+ tfidf_matrix = vectorizer.fit_transform([text1, text2])
73
+ similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
74
+ return similarity[0][0] * 100
75
+ except ValueError:
76
+ return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
77
 
78
  def load_contract(file):
79
+ if file is None:
80
+ return ""
81
+
82
  ext = file.name.split('.')[-1].lower()
83
+ try:
84
+ if ext == 'txt':
85
+ content = StringIO(file.getvalue().decode("utf-8")).read()
86
+ elif ext == 'pdf':
87
+ content = extract_text_from_pdf(file)
88
+ if not content:
89
+ # Fallback to PyPDF4
90
+ pdfReader = PyPDF4.PdfFileReader(file)
91
+ content = '\n'.join([pdfReader.getPage(i).extractText() for i in range(pdfReader.numPages)])
92
+ elif ext == 'docx':
93
+ content = docx2txt.process(file)
94
+ else:
95
+ st.warning('Unsupported file type')
96
+ return ""
97
+ return content.strip() if content else ""
98
+ except Exception as e:
99
+ st.error(f"Error loading {ext.upper()} file: {str(e)}")
100
+ return ""
101
 
102
  # ========== MAIN APP ==========
103
  def main():
104
  questions = load_questions()
105
  questions_short = load_questions_short()
106
 
107
+ if not questions or not questions_short or len(questions) != len(questions_short):
108
+ st.error("Failed to load questions or questions mismatch. Please check data files.")
109
+ return
110
+
111
  st.title("📑 Contract Analysis Suite")
112
  st.markdown("""
113
  Compare documents and analyze legal clauses using AI-powered question answering.
 
124
  key="file1"
125
  )
126
  contract_text1 = load_contract(uploaded_file1) if uploaded_file1 else ""
127
+ doc1_display = st.empty()
128
+
129
  with col2:
130
  uploaded_file2 = st.file_uploader(
131
  "Upload Second Document",
 
133
  key="file2"
134
  )
135
  contract_text2 = load_contract(uploaded_file2) if uploaded_file2 else ""
136
+ doc2_display = st.empty()
137
+
138
+ # Update document displays
139
+ if uploaded_file1:
140
+ doc1_display.text_area("Document 1 Content",
141
+ value=contract_text1,
142
+ height=200,
143
+ key="area1")
144
+ if uploaded_file2:
145
+ doc2_display.text_area("Document 2 Content",
146
+ value=contract_text2,
147
+ height=200,
148
+ key="area2")
149
 
150
  if not (uploaded_file1 and uploaded_file2):
151
  st.warning("Please upload both documents to proceed")
 
156
  with st.expander("Show Document Differences", expanded=True):
157
  if st.button("Compare Documents"):
158
  with st.spinner("Analyzing documents..."):
159
+ if not contract_text1.strip() or not contract_text2.strip():
160
+ st.error("One or both documents appear to be empty or couldn't be read properly")
161
+ return
162
+
163
  similarity_score = calculate_similarity(contract_text1, contract_text2)
164
  st.metric("Document Similarity Score", f"{similarity_score:.2f}%")
165
 
 
175
 
176
  # ===== QUESTION ANALYSIS SECTION =====
177
  st.header("3. Clause Analysis")
178
+
179
+ try:
180
+ question_selected = st.selectbox(
181
+ 'Select a legal question to analyze:',
182
+ questions_short,
183
+ index=0,
184
+ key="question_select"
185
+ )
186
+ question_idx = questions_short.index(question_selected)
187
+ selected_question = questions[question_idx]
188
+ except Exception as e:
189
+ st.error(f"Error selecting question: {str(e)}")
190
+ return
191
 
192
  if st.button("Analyze Both Documents"):
193
+ if not (contract_text1.strip() and contract_text2.strip()):
194
+ st.error("Please ensure both documents have readable content")
195
  return
196
 
197
  col1, col2 = st.columns(2)
 
199
  with col1:
200
  st.subheader("First Document Analysis")
201
  with st.spinner('Processing first document...'):
202
+ try:
203
+ predictions1 = run_prediction([selected_question], contract_text1, 'marshmellow77/roberta-base-cuad', n_best_size=5)
204
+ answer1 = predictions1.get('0', 'No answer found')
205
+ st.success(answer1 if answer1 else "No relevant clause found")
206
+ except Exception as e:
207
+ st.error(f"Analysis failed for Document 1: {str(e)}")
208
 
209
  with col2:
210
  st.subheader("Second Document Analysis")
211
  with st.spinner('Processing second document...'):
212
+ try:
213
+ predictions2 = run_prediction([selected_question], contract_text2, 'marshmellow77/roberta-base-cuad', n_best_size=5)
214
+ answer2 = predictions2.get('0', 'No answer found')
215
+ st.success(answer2 if answer2 else "No relevant clause found")
216
+ except Exception as e:
217
+ st.error(f"Analysis failed for Document 2: {str(e)}")
218
 
219
  if __name__ == "__main__":
220
  main()