Update app.py
app.py CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
+from PyPDF2 import PdfReader
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
-from PyPDF2 import PdfReader
 
 # Initialize the tokenizer and model from the saved checkpoint
 tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
@@ -15,71 +15,97 @@ model = AutoModelForCausalLM.from_pretrained(
 if torch.cuda.is_available():
     model.to("cuda")
 
-#
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Set up the Streamlit app layout
+st.set_page_config(page_title="RAG PDF Chatbot", layout="wide")
+
+# Sidebar with file upload and app title with creator details
+st.sidebar.title("π PDF Upload")
+uploaded_files = st.sidebar.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
+
+# Multicolor sidebar background
+st.sidebar.markdown("""
+<style>
+.sidebar .sidebar-content {
+    background: linear-gradient(135deg, #ff9a9e, #fad0c4 40%, #fad0c4 60%, #ff9a9e);
+    color: white;
+}
+</style>
+""", unsafe_allow_html=True)
+
+st.sidebar.markdown("""
+### Created by: [Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)
+""")
+
+# Main title
+st.markdown("""
+<h1 style='text-align: center; color: #ff6f61;'>π RAG PDF Chatbot</h1>
+""", unsafe_allow_html=True)
+
+# Multicolor background for the main content
+st.markdown("""
+<style>
+body {
+    background: linear-gradient(135deg, #89f7fe 0%, #66a6ff 100%);
+}
+</style>
+""", unsafe_allow_html=True)
+
+# Input field for user queries
+query = st.text_input("Enter your query here:")
+submit_button = st.button("Submit")
+
+# Initialize chat history
+if 'chat_history' not in st.session_state:
+    st.session_state.chat_history = []
+
+# Function to extract text from PDF files
+def extract_text_from_pdfs(files):
+    text = ""
+    for uploaded_file in files:
+        reader = PdfReader(uploaded_file)
+        for page in reader.pages:
+            text += page.extract_text() + "\n"
+    return text
+
+# Handle the query submission
+if submit_button and query:
+    # Extract text from uploaded PDFs
+    if uploaded_files:
+        pdf_text = extract_text_from_pdfs(uploaded_files)
+
+        # Prepare the input prompt
+        prompt = f"""
+### Instruction and Input:
+Based on the following context/document:
+{pdf_text}
+Please answer the question: {query}
 
-
-
-        # Format the input text
-        input_text = f"{user_query}\n\n### Response:\n"
+### Response:
+"""
 
-        # Encode the input text
-        input_ids = tokenizer(
+        # Encode the input text
+        input_ids = tokenizer(prompt, return_tensors="pt")
 
         # Use GPU for input ids if available
         if torch.cuda.is_available():
             input_ids = input_ids.to("cuda")
 
-        # Generate
+        # Generate the response
         outputs = model.generate(
             **input_ids,
-            max_new_tokens=
-            no_repeat_ngram_size=5,
+            max_new_tokens=500,
+            no_repeat_ngram_size=5,
         )
 
-        # Decode
-
+        # Decode the response
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-        #
-        st.
-        st.write(f"**A{len(st.session_state) + 1}: {answer.strip()}**")
-
-        # Store in session state for chat history
-        if "history" not in st.session_state:
-            st.session_state.history = []
-
-        st.session_state.history.append({
-            "question": user_query,
-            "answer": answer.strip()
-        })
+        # Update chat history
+        st.session_state.chat_history.append((query, response))
 
 # Display chat history
-if
-    for i,
-        st.
-        st.
+if st.session_state.chat_history:
+    for i, (q, a) in enumerate(st.session_state.chat_history):
+        st.markdown(f"**Question {i + 1}:** {q}")
+        st.markdown(f"**Answer:** {a}")
+        st.write("---")
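A note on the rewritten handler: the entire extracted PDF text is interpolated into the prompt, so a large upload can exceed the model's context window before generation even starts. The commit itself does no truncation; a minimal guard might trim the PDF text with the same tokenizer before building the prompt. In the sketch below, MAX_CONTEXT_TOKENS is an illustrative assumption, not a value defined anywhere in the Space:

# Hypothetical pre-truncation step (not part of this commit); the token budget is assumed
MAX_CONTEXT_TOKENS = 1500
context_ids = tokenizer(pdf_text, truncation=True, max_length=MAX_CONTEXT_TOKENS)["input_ids"]
pdf_text = tokenizer.decode(context_ids, skip_special_tokens=True)

Truncating pdf_text itself, rather than the finished prompt, keeps the trailing question and the "### Response:" marker intact. If the Space's requirements.txt does not already list it, the new import also implies adding PyPDF2 alongside streamlit, torch, and transformers.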