Neha13 committed on
Commit c55ed56 · verified · 1 Parent(s): e37cc1c

Create app.py

Files changed (1)
  1. app.py +372 -0
app.py ADDED
@@ -0,0 +1,372 @@
+ import streamlit as st
+ import os
+ import base64
+ from PIL import Image
+ import pytesseract
+ from pdf2image import convert_from_path
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import RetrievalQA
+ from langchain.memory import ConversationBufferMemory
+ from langchain_groq import ChatGroq
+ from langchain_community.vectorstores import FAISS
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
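+ 
+ # System prerequisites assumed by the OCR helpers below: the Tesseract engine
+ # with the 'eng' and 'guj' traineddata files (required by pytesseract), and
+ # poppler (used by pdf2image's convert_from_path) must be installed and on PATH.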
+ 
+ if 'pdf_ref' not in st.session_state:
+     st.session_state.pdf_ref = None
+ 
+ # Configure the Groq API key. Read it from the environment (or Streamlit
+ # secrets) rather than hard-coding a live key in source control.
+ os.environ.setdefault("GROQ_API_KEY", "YOUR_GROQ_API_KEY")
+ 
+ llm = ChatGroq(
+     model='llama3-70b-8192',
+     temperature=0.5,
+     max_tokens=None,
+     timeout=None,
+     max_retries=2
+ )
+ 
+ # OCR helpers for image and PDF files (English + Gujarati)
+ def ocr_image(image_path, language='eng+guj'):
+     img = Image.open(image_path)
+     text = pytesseract.image_to_string(img, lang=language)
+     return text
+ 
+ def ocr_pdf(pdf_path, language='eng+guj'):
+     # Render each PDF page to an image, then OCR the pages one by one
+     images = convert_from_path(pdf_path)
+     all_text = ""
+     for img in images:
+         text = pytesseract.image_to_string(img, lang=language)
+         all_text += text + "\n"
+     return all_text
+ 
+ def ocr_file(file_path):
+     file_extension = os.path.splitext(file_path)[1].lower()
+ 
+     if file_extension == ".pdf":
+         text_re = ocr_pdf(file_path, language='guj+eng')
+     elif file_extension in [".jpg", ".jpeg", ".png", ".bmp"]:
+         text_re = ocr_image(file_path, language='guj+eng')
+     else:
+         raise ValueError("Unsupported file format. Supported formats are PDF, JPG, JPEG, PNG, BMP.")
+ 
+     return text_re
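+ 
+ # Example usage (hypothetical path): ocr_file("temp/merit_list.pdf") returns
+ # the text of every page, OCR'd in Gujarati and English.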
+ 
+ def get_text_chunks(text):
+     # 500-character chunks with 100-character overlap keep context across splits
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+     chunks = text_splitter.split_text(text)
+     return chunks
+ 
+ # Function to create or update the vector store
+ def get_vector_store(text_chunks):
+     embeddings = HuggingFaceEmbeddings(
+         model_name="sentence-transformers/all-MiniLM-L6-v2",
+         model_kwargs={'device': 'cpu'},
+         encode_kwargs={'normalize_embeddings': True},
+     )
+     vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
+ 
+     # Ensure the directory exists before saving the vector store
+     os.makedirs("faiss_index", exist_ok=True)
+     vector_store.save_local("faiss_index")
+ 
+     return vector_store
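+ 
+ # The persisted index can only be reloaded with the same embedding model that
+ # built it, and FAISS.load_local needs allow_dangerous_deserialization=True
+ # because the docstore is pickled. A minimal reload sketch:
+ #   embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+ #   store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)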
+ 
+ # Process multiple files and build a combined vector store
+ def process_ocr_and_pdf_files(file_paths):
+     raw_text = ""
+     for file_path in file_paths:
+         raw_text += ocr_file(file_path) + "\n"
+     text_chunks = get_text_chunks(raw_text)
+     return get_vector_store(text_chunks)
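+ 
+ # Example (hypothetical file names): process_ocr_and_pdf_files(["temp/notice.pdf",
+ # "temp/form.jpg"]) OCRs both files, chunks the combined text, and persists a
+ # FAISS index under faiss_index/.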
82
+
83
+ # embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': True})
84
+ # new_vector_store = FAISS.load_local(
85
+ # "faiss_index", embeddings, allow_dangerous_deserialization=True
86
+ # )
87
+
88
+ # docs = new_vector_store.similarity_search("qux")
+ 
+ # Conversational chain for Q&A
+ def get_conversational_chain():
+     template = """Core Identity & Responsibilities
+ 
+ Role: Official AI Assistant for the Admission Committee for Professional Courses (ACPC), Gujarat
+ Mission: Process OCR-extracted text and provide clear, direct guidance on admissions and scholarships
+ Focus: Deliver user-friendly responses while handling OCR complexities internally
+ 
+ Processing Framework
+ 
+ 1. Text & Document Processing
+ - Process OCR-extracted text from various document types, with attention to tables and structured data
+ - Internally identify and handle OCR errors without explicitly mentioning them unless critical
+ - Preserve tabular structures and relationships between data points
+ - Present information in clean, readable formats regardless of source OCR quality
+ 
+ 2. Language Handling
+ - Support seamless communication in both Gujarati and English
+ - Respond in the same language as the user's query
+ - Present technical terms in both languages when relevant
+ - Adjust language complexity to the user's comprehension level
+ 
+ 3. Response Principles
+ - Provide direct, concise answers (2-3 sentences for simple queries)
+ - Skip unnecessary OCR quality disclaimers unless information is critically ambiguous
+ - Present information in user-friendly formats, especially for tables and numerical data
+ - Maintain a professional yet conversational tone
+ 
+ Query Handling Strategies
+ 
+ 1. Direct Information Queries
+ - Provide straightforward answers without mentioning OCR processing
+ - Example:
+   User: "What is the last date for application submission?"
+   Response: "The last date for application submission is June 15, 2025."
+   (NOT: "Based on the OCR-processed text, the last date appears to be...")
+ 
+ 2. Table Data Extraction
+ - Present tabular information in a clean, structured format
+ - Preserve relationships between data points
+ - Example:
+   User: "What are the fees for different courses?"
+   Response: "The fees for various courses are:
+   B.Tech: ₹1,15,000 (General), ₹58,000 (SC/ST)
+   B.Pharm: ₹85,000 (General), ₹42,500 (SC/ST)"
+   (NOT: "According to the OCR-extracted table, which may have quality issues...")
+ 
+ 3. Ambiguous Information Handling
+ - If OCR quality affects critical information (such as dates, amounts, or eligibility), provide the most likely correct information and add a brief note suggesting verification only for critical items
+ - Example: "The application deadline is June 15, 2025. For this important deadline, we recommend confirming on the official ACPC website."
+ 
+ 4. Uncertain Information Protocol
+ - For critically unclear OCR content, state the most probable information and add a simple verification suggestion without mentioning OCR
+ - Example: "Based on the available information, the income limit appears to be ₹6,00,000. For this critical criterion, please verify on the official ACPC portal."
+ 
+ 5. Structured Document Navigation
+ - Present information in the same logical structure as the original document
+ - Use headings and bullet points for clarity when appropriate
+ - Maintain document hierarchies when explaining multi-step processes
+ 
+ 6. Out-of-Scope Queries
+ - Politely redirect without mentioning document or OCR limitations
+ - Example: "This query is outside the scope of ACPC admission guidelines. For information about [topic], please contact [appropriate authority]."
+ 
+ 7. Key Information Emphasis
+ - Highlight critical information such as deadlines, eligibility criteria, and document requirements
+ - Make important numerical data visually distinct
+ - Prioritize accuracy for dates, amounts, and eligibility requirements
+ 
+ 8. Multi-Part Query Handling
+ - Address each component of multi-part queries separately
+ - Maintain logical flow between related pieces of information
+ - Preserve context when explaining complex processes
+ 
+ 9. Completeness Guidelines
+ - Ensure responses cover all aspects of the user's query
+ - Provide step-by-step guidance for procedural questions
+ - Include relevant related information that users might need
+ 
+ 10. Response Quality Control
+ - Internally verify numerical data consistency
+ - Apply contextual understanding to identify potential OCR errors without mentioning them
+ - Present information with confidence unless critically uncertain
+ - Focus on delivering actionable information rather than discussing document limitations
+ 
+ Input:
+ OCR-processed text from uploaded documents: {context}
+ Chat History: {history}
+ Current Question: {question}
+ 
+ Output:
+ Give a clear, direct, and user-friendly response that focuses on the information itself rather than its OCR source. Present information confidently, mentioning verification only for critically important or potentially ambiguous details.
+ """
+     # Must match the embedding model used in get_vector_store to build the index
+     embeddings = HuggingFaceEmbeddings(
+         model_name="sentence-transformers/all-MiniLM-L6-v2",
+         model_kwargs={'device': 'cpu'},
+         encode_kwargs={'normalize_embeddings': True},
+     )
+     new_vector_store = FAISS.load_local(
+         "faiss_index", embeddings, allow_dangerous_deserialization=True
+     )
+     QA_CHAIN_PROMPT = PromptTemplate(
+         input_variables=["history", "context", "question"], template=template
+     )
+     qa_chain = RetrievalQA.from_chain_type(
+         llm,
+         retriever=new_vector_store.as_retriever(),
+         chain_type='stuff',
+         verbose=True,
+         chain_type_kwargs={
+             "verbose": True,
+             "prompt": QA_CHAIN_PROMPT,
+             "memory": ConversationBufferMemory(memory_key="history", input_key="question"),
+         },
+     )
+     return qa_chain
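+ 
+ # Note: RetrievalQA runs its own retrieval through the FAISS retriever, so the
+ # chain is invoked with just a "query" input; ConversationBufferMemory feeds
+ # prior turns into the {history} slot of the prompt.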
+ 
+ def handle_uploaded_file(uploaded_file, show_in_sidebar=False):
+     file_extension = os.path.splitext(uploaded_file.name)[1].lower()
+     file_path = os.path.join("temp", uploaded_file.name)
+     os.makedirs(os.path.dirname(file_path), exist_ok=True)
+ 
+     with open(file_path, "wb") as f:
+         f.write(uploaded_file.getbuffer())
+ 
+     # Optionally preview the document in the sidebar
+     if show_in_sidebar:
+         st.sidebar.write(f"### File: {uploaded_file.name}")
+ 
+         if file_extension == ".pdf":
+             # Embed the PDF in the sidebar as a base64-encoded iframe
+             with open(file_path, "rb") as pdf_file:
+                 pdf_data = pdf_file.read()
+             pdf_base64 = base64.b64encode(pdf_data).decode('utf-8')
+             st.sidebar.markdown(f'<iframe src="data:application/pdf;base64,{pdf_base64}" width="500" height="500"></iframe>', unsafe_allow_html=True)
+         elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
+             img = Image.open(file_path)
+             st.sidebar.image(img, caption=f"Uploaded Image: {uploaded_file.name}", use_container_width=True)
+         else:
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 content = f.read()
+             st.sidebar.text_area("File Content", content, height=300)
+ 
+ def user_input(user_question):
+     # The chain retrieves its own context from the FAISS index, so only the
+     # "query" key is needed here
+     chain = get_conversational_chain()
+     response = chain({"query": user_question}, return_only_outputs=True)
+     result = response.get("result", "No result found")
+ 
+     # Save the question and answer to session state for history tracking
+     if 'conversation_history' not in st.session_state:
+         st.session_state.conversation_history = []
+ 
+     # Append the new question and response to the history
+     st.session_state.conversation_history.append({'question': user_question, 'answer': result})
+ 
+     return result
+ 
+ # Streamlit app to upload files and interact with the Q&A system
+ def main():
+     st.title("File Upload and OCR Processing")
+     st.write("Upload up to 5 files (PDF, JPG, JPEG, PNG, BMP)")
+ 
+     uploaded_files = st.file_uploader("Choose files", type=["pdf", "jpg", "jpeg", "png", "bmp"], accept_multiple_files=True)
+ 
+     if uploaded_files:
+         file_paths = []
+ 
+         # Save uploaded files before processing
+         for uploaded_file in uploaded_files[:5]:  # Limit to 5 files
+             file_path = os.path.join("temp", uploaded_file.name)
+             os.makedirs(os.path.dirname(file_path), exist_ok=True)
+             with open(file_path, "wb") as f:
+                 f.write(uploaded_file.getbuffer())
+             file_paths.append(file_path)
+ 
+         # OCR the files and update the vector store
+         st.write("Processing files...")
+         process_ocr_and_pdf_files(file_paths)
+         st.write("Processing completed! The vector store has been updated.")
+ 
+         show_in_sidebar = st.sidebar.checkbox("Show files in Sidebar", value=True)
+ 
+         # Display each uploaded file in its native format
+         for uploaded_file in uploaded_files:
+             handle_uploaded_file(uploaded_file, show_in_sidebar)
+ 
+     # Ask the user for a question related to the documents
+     user_question = st.text_input("Ask a question related to the uploaded documents:")
+ 
+     if user_question:
+         response = user_input(user_question)
+         st.write("Answer:", response)
+ 
+     with st.expander('Conversation History'):
+         # .get guards the first run, before any question has been asked
+         for entry in st.session_state.get('conversation_history', []):
+             st.info(f"Q: {entry['question']}\nA: {entry['answer']}")
+ 
+ 
+ if __name__ == "__main__":
+     main()
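+ 
+ # Assumed Python dependencies (install via pip): streamlit, pillow, pytesseract,
+ # pdf2image, langchain, langchain-community, langchain-groq, faiss-cpu,
+ # sentence-transformers.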