gnaw05 committed on
Commit
d9acf37
·
1 Parent(s): 355bf63
Files changed (3) hide show
  1. app.py +134 -0
  2. predict.py +24 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from io import StringIO
3
+ import PyPDF4
4
+ import pdfplumber
5
+ import docx2txt
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ import difflib
9
+ from predict import run_prediction
10
+
11
+ # ========== CONFIG ==========
12
+ st.set_page_config(page_title="📑 Contract Analyzer", layout="wide")
13
+
14
+ # ========== FUNCTIONS ==========
15
def extract_text_from_pdf(uploaded_file):
    """Return the text of every page of a PDF, joined by newlines.

    pdfplumber is tried first. If it raises, PyPDF4 is used as a fallback.
    If the fallback also fails, the error is shown with ``st.error`` and an
    empty string is returned.

    Args:
        uploaded_file: A path or binary file-like object accepted by
            ``pdfplumber.open`` and ``PyPDF4.PdfFileReader``.

    Returns:
        The extracted text, or "" when neither library could read the file.
    """
    try:
        with pdfplumber.open(uploaded_file) as pdf:
            # extract_text() may yield None for a page; treat that as empty.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate; any parsing failure falls through
        # to the PyPDF4 attempt below.
        pass

    try:
        # The first attempt may have advanced the stream; rewind it so the
        # fallback reader starts from the beginning of the file.
        rewind = getattr(uploaded_file, "seek", None)
        if callable(rewind):
            rewind(0)
        reader = PyPDF4.PdfFileReader(uploaded_file)
        return "\n".join(
            reader.getPage(i).extractText() for i in range(reader.numPages)
        )
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
        return ""
26
+
27
def load_text(file):
    """Read an uploaded document and return its text content.

    The type is chosen from the extension of ``file.name`` (case-insensitive):
    ``txt`` is decoded as UTF-8, ``pdf`` goes through
    ``extract_text_from_pdf`` and ``docx`` through ``docx2txt.process``.

    Args:
        file: An uploaded-file object exposing ``name`` and, for text files,
            ``getvalue()``; a falsy value means nothing was uploaded.

    Returns:
        The document text, or "" when there is no file, the extension is
        unsupported (a warning is shown) or loading fails (an error is shown).
    """
    if not file:
        return ""
    try:
        ext = file.name.rsplit(".", 1)[-1].lower()
        if ext == "txt":
            # "utf-8-sig" decodes ordinary UTF-8 and also drops a leading
            # byte-order mark, which would otherwise end up in the text.
            return file.getvalue().decode("utf-8-sig")
        if ext == "pdf":
            return extract_text_from_pdf(file)
        if ext == "docx":
            return docx2txt.process(file)
        st.warning(f"Unsupported file type: {ext}")
        return ""
    except Exception as e:
        st.error(f"Error loading file: {e}")
        return ""
44
+
45
def highlight_diff(text1, text2):
    """Build an HTML fragment showing a word-level diff of two texts.

    Both texts are split on whitespace and compared with ``difflib.Differ``.
    Words only in ``text1`` are wrapped in a red-background span, words only
    in ``text2`` in a green-background span, and common words are emitted
    as-is. Every emitted word is followed by a single space.

    Args:
        text1: The original text.
        text2: The text to compare against ``text1``.

    Returns:
        An HTML string; all document words are HTML-escaped.
    """
    # Local import: the module name must not clash with local variables and
    # the file's import block is left untouched.
    from html import escape

    pieces = []
    for entry in difflib.Differ().compare(text1.split(), text2.split()):
        marker = entry[:2]
        if marker == "? ":
            # Differ's intraline hint rows ("?   ^^ +") are not words from
            # either document; rendering them would inject stray symbols.
            continue
        # The fragment is rendered with unsafe_allow_html, so document
        # content must be escaped to avoid being interpreted as markup.
        word = escape(entry[2:])
        if marker == "- ":
            pieces.append(f'<span style="background-color:#ffcccc">{word}</span> ')
        elif marker == "+ ":
            pieces.append(f'<span style="background-color:#ccffcc">{word}</span> ')
        else:
            pieces.append(word + " ")
    return "".join(pieces)
57
+
58
def compute_similarity(text1, text2):
    """Return a similarity score between two texts as a percentage.

    The score is the cosine similarity of the TF-IDF vectors of the two
    texts, multiplied by 100. If the TF-IDF computation raises, the
    character-level ``difflib.SequenceMatcher`` ratio (times 100) is used
    instead.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float percentage; 0.0 when either text is empty or whitespace only.
    """
    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        # The token pattern keeps single-character tokens, which the
        # vectorizer's default pattern would discard.
        tfidf = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        tfidf_matrix = tfidf.fit_transform([text1, text2])
        sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
        return float(sim[0][0]) * 100
    except Exception:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit are not silently turned into a fallback score.
        return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
68
+
69
+ # ========== MAIN ==========
70
def main():
    """Render the Contract Analyzer page.

    The page has four steps: upload two documents, show their extracted
    text, optionally compare them (similarity score plus highlighted diff),
    and optionally run a question against each document with
    ``run_prediction``. Rendering stops after step 1 until both documents
    yield non-empty text.
    """
    st.title("📑 Contract Analyzer")
    st.markdown("Upload two contracts, compare them, and ask any question!")

    # Step 1: one uploader per column.
    st.header("1. Upload Documents")
    left, right = st.columns(2)
    with left:
        file1 = st.file_uploader("Upload First Document", type=["txt", "pdf", "docx"], key="file1")
    with right:
        file2 = st.file_uploader("Upload Second Document", type=["txt", "pdf", "docx"], key="file2")

    text1 = load_text(file1) if file1 else ""
    text2 = load_text(file2) if file2 else ""

    # Nothing below makes sense without text from both documents.
    if not text1 or not text2:
        st.warning("Please upload both documents to continue.")
        return

    # Step 2: show the extracted text side by side.
    st.header("2. Documents Content")
    left, right = st.columns(2)
    with left:
        st.subheader("First Document")
        st.text_area("Content of first document:", text1, height=300)
    with right:
        st.subheader("Second Document")
        st.text_area("Content of second document:", text2, height=300)

    # Step 3: similarity metric and highlighted diff, on demand.
    st.header("3. Compare Documents")
    if st.button("Compare Documents"):
        score = compute_similarity(text1, text2)
        st.metric("Similarity Score", f"{score:.2f}%")
        rendered_diff = highlight_diff(text1, text2)
        st.markdown("**Differences Highlighted:**", unsafe_allow_html=True)
        st.markdown(f"<div style='border:1px solid #ccc; padding:10px; max-height:400px; overflow:auto'>{rendered_diff}</div>", unsafe_allow_html=True)

    # Step 4: ask the same question of each document.
    st.header("4. Ask a Question")
    user_question = st.text_input("Enter your question about the contracts:")

    if not (user_question and st.button("Analyze Question")):
        return

    panels = (
        ("Answer from Document 1", "Failed on Document 1", text1),
        ("Answer from Document 2", "Failed on Document 2", text2),
    )
    for column, (heading, failure_prefix, contract_text) in zip(st.columns(2), panels):
        with column:
            st.subheader(heading)
            with st.spinner("Analyzing..."):
                try:
                    prediction = run_prediction([user_question], contract_text, model_name='marshmellow77/roberta-base-cuad', n_best_size=5)
                    st.success(prediction.get('0', 'No answer found'))
                except Exception as e:
                    st.error(f"{failure_prefix}: {e}")
132
+
133
+ if __name__ == "__main__":
134
+ main()
predict.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+
3
+ # Load the model once at import time so it is not reloaded on every call
4
+ qa_pipeline = pipeline(
5
+ "question-answering",
6
+ model="marshmellow77/roberta-base-cuad",
7
+ tokenizer="marshmellow77/roberta-base-cuad"
8
+ )
9
+
10
def run_prediction(questions, context, model_name=None, n_best_size=5):
    """Answer each question against ``context`` with the module-level pipeline.

    Args:
        questions: List of question strings,
            e.g. ``['What is the payment term?']``.
        context: The text (contract) in which to look for answers.
        model_name: Unused; accepted so existing callers keep working.
        n_best_size: Unused; accepted so existing callers keep working.

    Returns:
        A dict mapping each question's index, as a string ("0", "1", ...),
        to the ``'answer'`` field of the pipeline result for that question.
    """
    return {
        str(position): qa_pipeline({
            'context': context,
            'question': query
        })['answer']
        for position, query in enumerate(questions)
    }
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ scikit-learn
3
+ pdfplumber
4
+ PyPDF4
5
+ docx2txt
6
+ transformers
7
+ torch