import streamlit as st
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
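
# Dependencies: streamlit, PyPDF2, transformers, and torch. Loading with
# device_map="auto" below additionally requires the accelerate package.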

# Initialize the tokenizer and model from the saved checkpoint
tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
model = AutoModelForCausalLM.from_pretrained(
    "himmeow/vi-gemma-2b-RAG",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# Note: device_map="auto" already places the model on the GPU when one is
# available; calling model.to("cuda") on an accelerate-dispatched model is
# unnecessary and can raise an error.

# Set up the Streamlit app layout
st.set_page_config(page_title="RAG PDF Chatbot", layout="wide")

# Sidebar with file upload and app title with creator details
st.sidebar.title("📄 PDF Upload")
uploaded_files = st.sidebar.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)

# Multicolor sidebar background (the selector targets the sidebar container
# in recent Streamlit releases; the old .sidebar-content class no longer exists)
st.sidebar.markdown("""
    <style>
    section[data-testid="stSidebar"] {
        background: linear-gradient(135deg, #ff9a9e, #fad0c4 40%, #fad0c4 60%, #ff9a9e);
        color: white;
    }
    </style>
    """, unsafe_allow_html=True)

st.sidebar.markdown("""
### Created by: [Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)
""")

# Main title
st.markdown("""
<h1 style='text-align: center; color: #ff6f61;'>📄 RAG PDF Chatbot</h1>
""", unsafe_allow_html=True)

# Multicolor background for the main content (styling the .stApp container
# is more reliable than targeting body directly)
st.markdown("""
    <style>
    .stApp {
        background: linear-gradient(135deg, #89f7fe 0%, #66a6ff 100%);
    }
    </style>
    """, unsafe_allow_html=True)

# Input field for user queries
query = st.text_input("Enter your query here:")
submit_button = st.button("Submit")

# Initialize chat history
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []

# Function to extract text from PDF files
def extract_text_from_pdfs(files):
    text = ""
    for uploaded_file in files:
        reader = PdfReader(uploaded_file)
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer
            text += (page.extract_text() or "") + "\n"
    return text
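
# Note: very long PDFs can exceed the model's context window, and the prompt
# below pastes the extracted text in verbatim. A minimal truncation sketch;
# the 8000-character budget is an illustrative assumption, not a documented
# limit of this checkpoint:
def truncate_context(text, max_chars=8000):
    """Keep only the first max_chars characters of the extracted text."""
    return text[:max_chars]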

# Handle the query submission
if submit_button and query:
    # Extract text from uploaded PDFs
    if uploaded_files:
        pdf_text = extract_text_from_pdfs(uploaded_files)

        # Prepare the input prompt (pdf_text could be passed through the
        # truncate_context() sketch above to guard against oversized prompts)
        prompt = f"""
Based on the following context/document:
{pdf_text}
Please answer the question: {query}
"""

        # Encode the input text
        inputs = tokenizer(prompt, return_tensors="pt")

        # Move the encoded inputs to the same device as the model
        inputs = inputs.to(model.device)

        # Generate the response
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            no_repeat_ngram_size=5,
        )

        # Decode the response and clean it
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        clean_response = response.strip()

        # Update chat history
        st.session_state.chat_history.append((query, clean_response))
    else:
        st.warning("Please upload at least one PDF file before submitting a query.")

# Display chat history
if st.session_state.chat_history:
    for q, a in st.session_state.chat_history:
        st.markdown(f"**Question:** {q}")
        st.markdown(f"**Answer:** {a}")
        st.write("---")
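
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py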