Update app.py
app.py
CHANGED
@@ -3,114 +3,72 @@ from PyPDF2 import PdfReader
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
-#
-
-
-#
-
-
-
-#
-st.markdown("""
-    <style>
-
-        background: linear-gradient(135deg, #ff9a9e, #fad0c4 40%, #fad0c4 60%, #ff9a9e);
-        color: white;
-    }
-    </style>
-""", unsafe_allow_html=True)
-
-
-
-""")
-
-#
-st.markdown("""
-    <h1 style='text-align: center; color: #ff6f61;'>📚 RAG PDF Chatbot</h1>
-""", unsafe_allow_html=True)
-
-#
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-#
-try:
-    tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
-    model = AutoModelForCausalLM.from_pretrained("himmeow/vi-gemma-2b-RAG")
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = model.to(device)
-except Exception as e:
-    st.error(f"Error loading model or tokenizer: {e}")
-    st.stop()
-
-#
-def extract_text_from_pdfs(files):
-    text = ""
-    for uploaded_file in files:
-        try:
-            reader = PdfReader(uploaded_file)
-            for page in reader.pages:
-                text += page.extract_text() + "\n"
-        except Exception as e:
-            st.error(f"Error reading PDF file: {e}")
-    return text
-
-#
-if
-    if not uploaded_files:
-        st.warning("⚠️ Please upload at least one PDF file before submitting.")
-    elif not query:
-        st.warning("⚠️ Please enter a query before submitting.")
-    else:
-        try:
-            # Extract text from uploaded PDFs
-            pdf_text = extract_text_from_pdfs(uploaded_files)
-            if not pdf_text.strip():
-                st.warning("⚠️ No text found in the uploaded PDFs.")
-            else:
-                # Prepare the input prompt
-                prompt = f"""
-                Based on the following context/document:
-                {pdf_text}
-                Please answer the question: {query}
-                """
-
-                # Encode the input text
-                inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length)
-
-                # Generate the response
-                outputs = model.generate(
-                    input_ids=inputs['input_ids'].to(device),
-                    max_new_tokens=500,
-                    no_repeat_ngram_size=5,
-                )
-
-                # Decode the response and clean it
-                response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-                clean_response = response.strip()
-
-
-
-
-
-
-
-#
-
-
-    st.markdown(f"**Question:** {q}")
-    st.markdown(f"**Answer:** {a}")
-    st.write("---")
+# Initialize the tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
+model = AutoModelForCausalLM.from_pretrained(
+    "himmeow/vi-gemma-2b-RAG",
+    device_map="auto",
+    torch_dtype=torch.bfloat16
+)
+
+# Use GPU if available
+if torch.cuda.is_available():
+    model.to("cuda")
+
+# Streamlit app layout
+st.set_page_config(page_title="📚 PDF Query App", page_icon=":book:", layout="wide")
+st.title("📚 PDF Query App")
+st.sidebar.title("Upload File and Query")
+
+# Sidebar: File Upload
+uploaded_file = st.sidebar.file_uploader("Upload your PDF file", type="pdf")
+
+# Sidebar: Query Input
+query = st.sidebar.text_input("Enter your query:")
+
+# Handle file upload
+if uploaded_file and query:
+    # Read the PDF file (uploaded_file is a file-like object from
+    # st.file_uploader, so it can be passed directly to PdfReader)
+    pdf_text = ""
+    reader = PdfReader(uploaded_file)
+    for page_num in range(len(reader.pages)):
+        page = reader.pages[page_num]
+        text = page.extract_text()
+        pdf_text += text + "\n"
+
+    # Define the prompt format for the model
+    prompt = """
+    ### Instruction and Input:
+    Based on the following context/document:
+    {}
+    Please answer the question: {}
+
+    ### Response:
+    {}
+    """
+
+    # Format the input text
+    input_text = prompt.format(pdf_text, query, " ")
+
+    # Encode the input text into input ids
+    input_ids = tokenizer(input_text, return_tensors="pt")
+
+    # Use GPU for input ids if available
+    if torch.cuda.is_available():
+        input_ids = input_ids.to("cuda")
+
+    # Generate text using the model
+    outputs = model.generate(
+        **input_ids,
+        max_new_tokens=500,  # Limit the number of tokens generated
+        no_repeat_ngram_size=5,  # Prevent repetition of 5-gram phrases
+    )
+
+    # Decode and display the results
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    st.write(response)
+
+# Footer with LinkedIn link
+st.sidebar.write("---")
+st.sidebar.write("Created by: [Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")
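Note that outputs[0] holds the prompt tokens followed by the newly generated ones, so st.write(response) echoes the uploaded PDF text back before the answer. A minimal sketch of trimming that echo before display (the slicing step below is an assumption, not part of this commit):

    # Sketch (assumption): keep only the tokens generated after the prompt.
    prompt_len = input_ids["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    st.write(answer.strip())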
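A long PDF can also push the prompt past the model's context window. The removed version guarded against this by truncating the encoded input, and the same guard could be re-applied here (a sketch, assuming tokenizer.model_max_length reflects the model's actual context limit):

    # Sketch (assumption): cap the encoded prompt at the model's maximum length,
    # as the previous version did with truncation=True / max_length.
    input_ids = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=tokenizer.model_max_length,
    )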