import os
import base64

import streamlit as st
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain_groq import ChatGroq
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure the Groq API key. Never hardcode real keys in source;
# export GROQ_API_KEY in the environment before launching the app.
os.environ.setdefault("GROQ_API_KEY", "<your-groq-api-key>")

llm = ChatGroq(
    model='llama3-70b-8192',
    temperature=0.5,
    max_tokens=None,
    timeout=None,
    max_retries=2
)


# OCR helpers for image and PDF files (English + Gujarati by default)
def ocr_image(image_path, language='eng+guj'):
    img = Image.open(image_path)
    return pytesseract.image_to_string(img, lang=language)


def ocr_pdf(pdf_path, language='eng+guj'):
    # Render each PDF page to an image, then OCR page by page
    images = convert_from_path(pdf_path)
    all_text = ""
    for img in images:
        all_text += pytesseract.image_to_string(img, lang=language) + "\n"
    return all_text


def ocr_file(file_path):
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension == ".pdf":
        return ocr_pdf(file_path, language='guj+eng')
    elif file_extension in [".jpg", ".jpeg", ".png", ".bmp"]:
        return ocr_image(file_path, language='guj+eng')
    else:
        raise ValueError("Unsupported file format. Supported formats are PDF, JPG, JPEG, PNG, BMP.")


def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    return text_splitter.split_text(text)


# Create or update the FAISS vector store from text chunks
def get_vector_store(text_chunks):
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    # Ensure the directory exists before saving the vector store
    os.makedirs("faiss_index", exist_ok=True)
    vector_store.save_local("faiss_index")
    return vector_store


# Process multiple files: OCR each one, chunk the combined text, build the index
def process_ocr_and_pdf_files(file_paths):
    raw_text = ""
    for file_path in file_paths:
        raw_text += ocr_file(file_path) + "\n"
    text_chunks = get_text_chunks(raw_text)
    return get_vector_store(text_chunks)


# Development smoke test for the persisted index (kept from the original; the
# embedding model must match the one used in get_vector_store above):
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
#                                    model_kwargs={'device': 'cpu'},
#                                    encode_kwargs={'normalize_embeddings': True})
# new_vector_store = FAISS.load_local(
#     "faiss_index", embeddings, allow_dangerous_deserialization=True
# )
# docs = new_vector_store.similarity_search("qux")
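
# Optional startup check (a sketch, not part of the original flow): the OCR
# helpers above assume the Tesseract 'eng' and 'guj' language packs are
# installed locally. pytesseract.get_languages() reports what the binary has.
# if "guj" not in pytesseract.get_languages(config=""):
#     st.warning("Tesseract Gujarati pack ('guj') not found; OCR quality will suffer.")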

# Conversational chain for Q&A over the indexed documents
def get_conversational_chain():
    template = """Core Identity & Responsibilities
Role: Official AI Assistant for the Admission Committee for Professional Courses (ACPC), Gujarat
Mission: Process OCR-extracted text and provide clear, direct guidance on admissions and scholarships
Focus: Deliver user-friendly responses while handling OCR complexities internally

Processing Framework
1. Text & Document Processing
- Process OCR-extracted text from various document types with attention to tables and structured data
- Internally identify and handle OCR errors without explicitly mentioning them unless critical
- Preserve tabular structures and relationships between data points
- Present information in clean, readable formats regardless of source OCR quality

2. Language Handling
- Support seamless communication in both Gujarati and English
- Respond in the same language as the user's query
- Present technical terms in both languages when relevant
- Adjust language complexity to the user's comprehension level

3. Response Principles
- Provide direct, concise answers (2-3 sentences for simple queries)
- Skip unnecessary OCR quality disclaimers unless information is critically ambiguous
- Present information in user-friendly formats, especially for tables and numerical data
- Maintain a professional yet conversational tone

Query Handling Strategies
1. Direct Information Queries
- Provide straightforward answers without mentioning OCR processing
- Example:
  User: "What is the last date for application submission?"
  Response: "The last date for application submission is June 15, 2025."
  (NOT: "Based on the OCR-processed text, the last date appears to be...")

2. Table Data Extraction
- Present tabular information in a clean, structured format
- Preserve relationships between data points
- Example:
  User: "What are the fees for different courses?"
  Response: "The fees for various courses are:
  B.Tech: ₹1,15,000 (General), ₹58,000 (SC/ST)
  B.Pharm: ₹85,000 (General), ₹42,500 (SC/ST)"
  (NOT: "According to the OCR-extracted table, which may have quality issues...")

3. Ambiguous Information Handling
- If OCR quality affects critical information (like dates, amounts, eligibility):
  provide the most likely correct information and add a brief note suggesting
  verification only for critical information
- Example: "The application deadline is June 15, 2025. For this important deadline, we recommend confirming on the official ACPC website."

4. Uncertain Information Protocol
- For critically unclear OCR content: state the most probable information and
  add a simple verification suggestion without mentioning OCR
- Example: "Based on the available information, the income limit appears to be ₹6,00,000. For this critical criterion, please verify on the official ACPC portal."

5. Structured Document Navigation
- Present information in the same logical structure as the original document
- Use headings and bullet points for clarity when appropriate
- Maintain document hierarchies when explaining multi-step processes

6. Out-of-Scope Queries
- Politely redirect without mentioning document or OCR limitations
- Example: "This query is outside the scope of ACPC admission guidelines. For information about [topic], please contact [appropriate authority]."

7. Key Information Emphasis
- Highlight critical information like deadlines, eligibility criteria, and document requirements
- Make important numerical data visually distinct
- Prioritize accuracy for dates, amounts, and eligibility requirements

8. Multi-Part Query Handling
- Address each component of multi-part queries separately
- Maintain logical flow between related pieces of information
- Preserve context when explaining complex processes

9. Completeness Guidelines
- Ensure responses cover all aspects of user queries
- Provide step-by-step guidance for procedural questions
- Include relevant related information that users might need

10. Response Quality Control
- Internally verify numerical data consistency
- Apply contextual understanding to identify potential OCR errors without mentioning them
- Present information with confidence unless critically uncertain
- Focus on delivering actionable information rather than discussing document limitations

Input:
OCR-processed text from uploaded documents: {context}
Chat History: {history}
Current Question: {question}

Output:
Give a clear, direct, and user-friendly response that focuses on the information itself rather than its OCR source. Present information confidently, mentioning verification only for critically important or potentially ambiguous details.
"""
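    # Descriptive note on the template wiring: with chain_type='stuff',
    # {context} is filled from the documents the retriever returns,
    # {question} comes from the caller's "query" input, and {history}
    # is supplied by the ConversationBufferMemory configured below.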
    embeddings = HuggingFaceEmbeddings(
        # Must match the embedding model used to build the index in get_vector_store
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )
    new_vector_store = FAISS.load_local(
        "faiss_index", embeddings, allow_dangerous_deserialization=True
    )
    QA_CHAIN_PROMPT = PromptTemplate(input_variables=["history", "context", "question"], template=template)
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=new_vector_store.as_retriever(),
        chain_type='stuff',
        verbose=True,
        chain_type_kwargs={
            "verbose": True,
            "prompt": QA_CHAIN_PROMPT,
            "memory": ConversationBufferMemory(memory_key="history", input_key="question"),
        }
    )
    return qa_chain
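
# Usage sketch (assumes faiss_index/ was already built by an upload run):
# the chain takes a dict keyed by "query" and returns one keyed by "result".
# The sample question is hypothetical.
# qa = get_conversational_chain()
# out = qa({"query": "What documents are required for admission?"})
# print(out["result"])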

def handle_uploaded_file(uploaded_file, show_in_sidebar=False):
    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    file_path = os.path.join("temp", uploaded_file.name)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Optionally preview the uploaded document in the sidebar
    if show_in_sidebar:
        st.sidebar.write(f"### File: {uploaded_file.name}")
        if file_extension == ".pdf":
            # Embed the PDF as a base64 data URI inside an HTML iframe
            with open(file_path, "rb") as pdf_file:
                pdf_data = pdf_file.read()
            pdf_base64 = base64.b64encode(pdf_data).decode('utf-8')
            st.sidebar.markdown(
                f'<iframe src="data:application/pdf;base64,{pdf_base64}" '
                f'width="100%" height="500"></iframe>',
                unsafe_allow_html=True
            )
        elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
            img = Image.open(file_path)
            st.sidebar.image(img, caption=f"Uploaded Image: {uploaded_file.name}", use_container_width=True)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            st.sidebar.text_area("File Content", content, height=300)


def user_input(user_question):
    chain = get_conversational_chain()
    # RetrievalQA performs its own retrieval against the saved index,
    # so only the "query" input is needed here.
    response = chain({"query": user_question}, return_only_outputs=True)
    result = response.get("result", "No result found")

    # Track the question/answer pair in session state for the history view
    if 'conversation_history' not in st.session_state:
        st.session_state.conversation_history = []
    st.session_state.conversation_history.append({'question': user_question, 'answer': result})

    return result
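
# Example call (hypothetical question; requires processed documents and an
# active Streamlit session, since the answer is stored in st.session_state):
# answer = user_input("What is the last date for application submission?")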

# Streamlit app: upload files, OCR and index them, then answer questions
def main():
    st.title("File Upload and OCR Processing")
    st.write("Upload up to 5 files (PDF, JPG, JPEG, PNG, BMP)")

    uploaded_files = st.file_uploader(
        "Choose files",
        type=["pdf", "jpg", "jpeg", "png", "bmp"],
        accept_multiple_files=True
    )

    if uploaded_files:
        # Save the uploaded files locally (limited to 5) and collect their paths
        file_paths = []
        for uploaded_file in uploaded_files[:5]:
            file_path = os.path.join("temp", uploaded_file.name)
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            file_paths.append(file_path)

        # OCR the files and build/update the vector store
        st.write("Processing files...")
        process_ocr_and_pdf_files(file_paths)
        st.write("Processing completed! The vector store has been updated.")

        # Display each uploaded file in its native format
        show_in_sidebar = st.sidebar.checkbox("Show files in Sidebar", value=True)
        for uploaded_file in uploaded_files:
            handle_uploaded_file(uploaded_file, show_in_sidebar)

    # Question box for the processed documents
    user_question = st.text_input("Ask a question related to the uploaded documents:")
    if user_question:
        response = user_input(user_question)
        st.write("Answer:", response)

    # Guard with .get() so the expander also works before any question is asked
    with st.expander('Conversation History'):
        for entry in st.session_state.get('conversation_history', []):
            st.info(f"Q: {entry['question']}\nA: {entry['answer']}")


if __name__ == "__main__":
    main()
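
# Running the app (a sketch; the file name and system packages are assumptions
# based on the imports above — pdf2image needs Poppler, and pytesseract needs
# the Tesseract binary plus the Gujarati language pack):
#   sudo apt-get install tesseract-ocr tesseract-ocr-guj poppler-utils
#   pip install streamlit pillow pytesseract pdf2image langchain \
#       langchain-community langchain-groq faiss-cpu sentence-transformers
#   export GROQ_API_KEY=<your-groq-api-key>
#   streamlit run app.py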