# Neha13's picture
# Create app.py
# c55ed56 verified
import streamlit as st
import os
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain_groq import ChatGroq
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import VectorStoreRetriever
import streamlit.components.v1 as components
from streamlit_pdf_viewer import pdf_viewer
from io import BytesIO
import base64
# Streamlit re-executes this script on every interaction; create the
# `pdf_ref` slot only when it is not already present in the session.
if 'pdf_ref' not in st.session_state:
    st.session_state['pdf_ref'] = None
# Initialize the Groq API key and the model.
# SECURITY: the key is read from the environment and must never be written
# into this file. Any key that was ever committed here has to be treated as
# leaked and rotated in the Groq console.
if not os.environ.get("GROQ_API_KEY"):
    st.error(
        "GROQ_API_KEY is not set. Export it in the environment "
        "(or configure it as a secret of the deployment) and restart the app."
    )
    st.stop()
# config = {'max_new_tokens': 512, 'context_length': 8000}
llm = ChatGroq(
    model='llama3-70b-8192',
    temperature=0.5,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)
# Define OCR functions for image and PDF files
def ocr_image(image_path, language='eng+guj'):
    """Run Tesseract OCR on a single image file.

    Args:
        image_path: Path of the image to read.
        language: Tesseract language specification passed as ``lang``.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    # The context manager closes the underlying file handle as soon as OCR
    # is done instead of leaving it open until garbage collection.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img, lang=language)
def ocr_pdf(pdf_path, language='eng+guj'):
    """OCR every page of a PDF and return the combined text.

    The PDF is rasterised with ``convert_from_path``; each resulting page
    image is passed to Tesseract, and every page's text is followed by a
    newline in the returned string.
    """
    pages = convert_from_path(pdf_path)
    return "".join(
        pytesseract.image_to_string(page, lang=language) + "\n"
        for page in pages
    )
def ocr_file(file_path, language='guj+eng'):
    """Extract text from a PDF or image file via OCR.

    The handler is chosen from the (case-insensitive) file extension.

    Args:
        file_path: Path of the file to process.
        language: Tesseract language specification forwarded to the OCR
            helper. Defaults to the Gujarati + English setting.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is not PDF, JPG, JPEG, PNG or BMP.
    """
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension == ".pdf":
        return ocr_pdf(file_path, language=language)
    if file_extension in (".jpg", ".jpeg", ".png", ".bmp"):
        return ocr_image(file_path, language=language)
    raise ValueError("Unsupported file format. Supported formats are PDF, JPG, JPEG, PNG, BMP.")
def get_text_chunks(text):
    """Split ``text`` into chunks of size 500 with an overlap of 100.

    Returns whatever ``RecursiveCharacterTextSplitter.split_text`` produces
    for the given text.
    """
    splitter_options = {"chunk_size": 500, "chunk_overlap": 100}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
# Function to create or update the vector store
def get_vector_store(text_chunks):
    """Embed ``text_chunks`` into a FAISS index and save it to disk.

    The index is built from scratch from the given chunks, written to the
    local ``faiss_index`` directory, and returned.
    """
    index_dir = "faiss_index"
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )
    store = FAISS.from_texts(text_chunks, embedding=embedder)
    # Ensure the directory exists before saving the vector store
    os.makedirs(index_dir, exist_ok=True)
    store.save_local(index_dir)
    return store
# Function to process multiple files and extract vector store
def process_ocr_and_pdf_files(file_paths):
    """OCR every file in ``file_paths`` and index the combined text.

    Each file's text is followed by a newline; the concatenation is split
    into chunks and handed to ``get_vector_store``, whose result is returned.
    """
    combined_text = "".join(ocr_file(path) + "\n" for path in file_paths)
    return get_vector_store(get_text_chunks(combined_text))
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': True})
# new_vector_store = FAISS.load_local(
# "faiss_index", embeddings, allow_dangerous_deserialization=True
# )
# docs = new_vector_store.similarity_search("qux")
# Conversational chain for Q&A
def get_conversational_chain():
    """Build the RetrievalQA chain used to answer questions.

    The chain retrieves from the FAISS index stored in ``faiss_index`` and
    stuffs the retrieved text into the ACPC assistant prompt together with
    the chat history and the current question.

    A new ``ConversationBufferMemory`` is created on every call, so the
    ``history`` slot only accumulates for the lifetime of the returned chain.

    Returns:
        The configured ``RetrievalQA`` chain.
    """
    template = """Core Identity & Responsibilities
Role: Official AI Assistant for Admission Committee for Professional Courses (ACPC), Gujarat
Mission: Process OCR-extracted text and provide clear, direct guidance on admissions and scholarships
Focus: Deliver user-friendly responses while handling OCR complexities internally
Processing Framework
1. Text & Document Processing
Process OCR-extracted text from various document types with attention to tables and structured data
Internally identify and handle OCR errors without explicitly mentioning them unless critical
Preserve tabular structures and relationships between data points
Present information in clean, readable formats regardless of source OCR quality
2. Language Handling
Support seamless communication in both Gujarati and English
Respond in the same language as the user's query
Present technical terms in both languages when relevant
Adjust language complexity to user comprehension level
3. Response Principles
Provide direct, concise answers (2-3 sentences for simple queries)
Skip unnecessary OCR quality disclaimers unless information is critically ambiguous
Present information in user-friendly formats, especially for tables and numerical data
Maintain professional yet conversational tone
Query Handling Strategies
1. Direct Information Queries
Provide straightforward answers without mentioning OCR processing
Example:
User: "What is the last date for application submission?"
Response: "The last date for application submission is June 15, 2025."
(NOT: "Based on the OCR-processed text, the last date appears to be...")
2. Table Data Extraction
Present tabular information in clean, structured format
Preserve relationships between data points
Example:
User: "What are the fees for different courses?"
Response:
"The fees for various courses are:
B.Tech: ₹1,15,000 (General), ₹58,000 (SC/ST)
B.Pharm: ₹85,000 (General), ₹42,500 (SC/ST)"
(NOT: "According to the OCR-extracted table, which may have quality issues...")
3. Ambiguous Information Handling
If OCR quality affects critical information (like dates, amounts, eligibility):
Provide the most likely correct information
Add a brief note suggesting verification only for critical information
Example: "The application deadline is June 15, 2025. For this important deadline, we recommend confirming on the official ACPC website."
4. Uncertain Information Protocol
For critically unclear OCR content:
State the most probable information
Add a simple verification suggestion without mentioning OCR
Example: "Based on the available information, the income limit appears to be ₹6,00,000. For this critical criterion, please verify on the official ACPC portal."
5. Structured Document Navigation
Present information in the same logical structure as the original document
Use headings and bullet points for clarity when appropriate
Maintain document hierarchies when explaining multi-step processes
6. Out-of-Scope Queries
Politely redirect without mentioning document or OCR limitations
Example: "This query is outside the scope of ACPC admission guidelines. For information about [topic], please contact [appropriate authority]."
7. Key Information Emphasis
Highlight critical information like deadlines, eligibility criteria, and document requirements
Make important numerical data visually distinct
Prioritize accuracy for dates, amounts, and eligibility requirements
8. Multi-Part Query Handling
Address each component of multi-part queries separately
Maintain logical flow between related pieces of information
Preserve context when explaining complex processes
9. Completeness Guidelines
Ensure responses cover all aspects of user queries
Provide step-by-step guidance for procedural questions
Include relevant related information that users might need
10. Response Quality Control
Internally verify numerical data consistency
Apply contextual understanding to identify potential OCR errors without mentioning them
Present information with confidence unless critically uncertain
Focus on delivering actionable information rather than discussing document limitations
Input:
OCR-processed text from uploaded documents: {context}
Chat History: {history}
Current Question: {question}
Output:
Give a clear, direct, and user-friendly response that focuses on the information itself rather than its OCR source. Present information confidently, mentioning verification only for critically important or potentially ambiguous details.
"""
    # Queries must be embedded with the same model that get_vector_store used
    # to build the index (all-MiniLM-L6-v2); a different model would place
    # the query vector in an unrelated embedding space and degrade retrieval.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )
    # NOTE(security): allow_dangerous_deserialization=True unpickles data from
    # disk. That is acceptable only because "faiss_index" is written by this
    # app itself; never point it at an index obtained from an untrusted source.
    new_vector_store = FAISS.load_local(
        "faiss_index", embeddings, allow_dangerous_deserialization=True
    )
    QA_CHAIN_PROMPT = PromptTemplate(
        input_variables=["history", "context", "question"],
        template=template,
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=new_vector_store.as_retriever(),
        chain_type='stuff',
        verbose=True,
        chain_type_kwargs={
            "verbose": True,
            "prompt": QA_CHAIN_PROMPT,
            "memory": ConversationBufferMemory(memory_key="history", input_key="question"),
        },
    )
    return qa_chain
def handle_uploaded_file(uploaded_file, show_in_sidebar=False):
    """Save an uploaded file under ``temp/`` and optionally preview it.

    Args:
        uploaded_file: Upload object exposing ``.name`` and ``.getbuffer()``
            (the items returned by ``st.file_uploader`` in ``main``).
        show_in_sidebar: When true, render a preview in the sidebar: PDFs in
            an embedded iframe, images via ``st.sidebar.image``, anything
            else as UTF-8 text in a text area.
    """
    # The file name is supplied by the client; keep only its final component
    # so it can never point outside the "temp" directory.
    safe_name = os.path.basename(uploaded_file.name)
    file_extension = os.path.splitext(safe_name)[1].lower()
    file_path = os.path.join("temp", safe_name)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # NOTE(review): the original indentation was lost; the preview branches
    # below are treated as belonging to the `show_in_sidebar` case.
    if not show_in_sidebar:
        return

    st.sidebar.write(f"### File: {uploaded_file.name}")
    if file_extension == ".pdf":
        # Embed the PDF as a base64 data URI inside an HTML iframe.
        with open(file_path, "rb") as pdf_file:
            pdf_base64 = base64.b64encode(pdf_file.read()).decode('utf-8')
        st.sidebar.markdown(f'<iframe src="data:application/pdf;base64,{pdf_base64}" width="500" height="500"></iframe>', unsafe_allow_html=True)
    elif file_extension in ('.jpg', '.jpeg', '.png', '.bmp'):
        # Close the image file handle once Streamlit has rendered it.
        with Image.open(file_path) as img:
            st.sidebar.image(img, caption=f"Uploaded Image: {uploaded_file.name}", use_container_width=True)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        st.sidebar.text_area("File Content", content, height=300)
# Optionally show document in the main content area
# st.write(f"### Main Panel - {uploaded_file.name}")
# if file_extension == '.pdf':
# st.write("Displaying PDF:")
# st.components.v1.html(f'<embed src="{file_path}" width="700" height="500" type="application/pdf">')
# elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
# img = Image.open(file_path)
# st.image(img, caption=f"Uploaded Image: {uploaded_file.name}", use_column_width=True)
# else:
# with open(file_path, 'r', encoding='utf-8') as f:
# content = f.read()
# st.text_area("File Content", content, height=300)
def user_input(user_question):
    """Answer ``user_question`` and record the exchange in session state.

    Args:
        user_question: The question text entered by the user.

    Returns:
        The chain's ``result`` value, or ``"No result found"`` when the
        response carries no such key.
    """
    # The RetrievalQA chain fetches its own documents through its retriever
    # and only takes "query" as input, so no separate embedding load or
    # similarity search is performed here.
    chain = get_conversational_chain()
    response = chain({"query": user_question}, return_only_outputs=True)
    result = response.get("result", "No result found")

    # Save the question and answer to session state for history tracking
    if 'conversation_history' not in st.session_state:
        st.session_state.conversation_history = []
    st.session_state.conversation_history.append({'question': user_question, 'answer': result})
    return result
# def handle_uploaded_file(uploaded_file, show_in_sidebar=False):
# file_extension = os.path.splitext(uploaded_file.name)[1].lower()
# file_path = os.path.join("temp", uploaded_file.name)
# os.makedirs(os.path.dirname(file_path), exist_ok=True)
# with open(file_path, "wb") as f:
# f.write(uploaded_file.getbuffer())
# # Show document in the main panel and optionally in the sidebar
# if show_in_sidebar:
# st.sidebar.write(f"### File: {uploaded_file.name}")
# if file_extension == '.pdf':
# st.sidebar.write("Displaying PDF:")
# st.sidebar.components.html(f'<embed src="{file_path}" width="700" height="500" type="application/pdf">')
# # st.sidebar.components.v1.html(f'<embed src="{file_path}" width="700" height="500" type="application/pdf">')
# elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
# img = Image.open(file_path)
# st.sidebar.image(img, caption=f"Uploaded Image: {uploaded_file.name}", use_column_width=True)
# else:
# with open(file_path, 'r', encoding='utf-8') as f:
# content = f.read()
# st.sidebar.text_area("File Content", content, height=300)
# Optionally show document in the main content area
# st.write(f"### Main Panel - {uploaded_file.name}")
# if file_extension == '.pdf':
# st.write("Displaying PDF:")
# st.components.v1.html(f'<embed src="{file_path}" width="700" height="500" type="application/pdf">')
# elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
# img = Image.open(file_path)
# st.image(img, caption=f"Uploaded Image: {uploaded_file.name}", use_column_width=True)
# else:
# with open(file_path, 'r', encoding='utf-8') as f:
# content = f.read()
# st.text_area("File Content", content, height=300)
# Streamlit app to upload files and interact with the Q&A system
def main():
    """Render the upload / OCR / question-answering page.

    Uploaded files (at most the first five) are saved under ``temp/``,
    OCR-processed and indexed; every uploaded file is then passed to
    ``handle_uploaded_file`` for the optional sidebar preview. A question
    entered by the user is answered through ``user_input`` and the recorded
    exchanges are listed in an expander.
    """
    st.title("File Upload and OCR Processing")
    st.write("Upload up to 5 files (PDF, JPG, JPEG, PNG, BMP)")
    uploaded_files = st.file_uploader("Choose files", type=["pdf", "jpg", "jpeg", "png", "bmp"], accept_multiple_files=True)

    if uploaded_files:
        file_paths = []
        # Save uploaded files and process them
        for uploaded_file in uploaded_files[:5]:  # Limit to 5 files
            # Keep only the final path component of the client-supplied name
            # so the file always lands inside "temp".
            file_path = os.path.join("temp", os.path.basename(uploaded_file.name))
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            file_paths.append(file_path)
        # Process the OCR and PDF files and store the vector data
        st.write("Processing files...")
        process_ocr_and_pdf_files(file_paths)
        st.write("Processing completed! The vector store has been updated.")

    show_in_sidebar = st.sidebar.checkbox("Show files in Sidebar", value=True)
    # Process and display each uploaded file in its format
    for uploaded_file in uploaded_files or []:
        handle_uploaded_file(uploaded_file, show_in_sidebar)

    # Ask user for a question related to the documents
    user_question = st.text_input("Ask a question related to the uploaded documents:")
    if user_question:
        if os.path.isdir("faiss_index"):
            response = user_input(user_question)
            st.write("Answer:", response)
        else:
            # Without an index on disk the chain cannot be built; tell the
            # user what to do instead of failing with a traceback.
            st.warning("Please upload at least one document before asking a question.")

    # The history list is only created once a question has been answered,
    # so fall back to an empty list until then.
    with st.expander('Conversation History'):
        for entry in st.session_state.get('conversation_history', []):
            st.info(f"Q: {entry['question']}\nA: {entry['answer']}")
# Entry point: build the page when this file is executed as a script.
if __name__ == "__main__":
    main()