import os
import streamlit as st
import fitz  # PyMuPDF
import logging
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader

# --- Configuration ---
st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
st.title("📚 RAG-based PDF Chatbot")
device = "cpu"

# --- Logging ---
logging.basicConfig(level=logging.INFO)

# --- Load LLM ---
@st.cache_resource
def load_model():
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipe = pipeline(
        'text2text-generation',
        model=model,
        tokenizer=tokenizer,
        max_length=1024,
        do_sample=True,
        temperature=0.3,
        top_k=50,
        top_p=0.95,
    )
    return HuggingFacePipeline(pipeline=pipe)

# --- Extract PDF Text ---
def read_pdf(file):
    try:
        doc = fitz.open(stream=file.read(), filetype="pdf")
        text = ""
        for page in doc:
            text += page.get_text()
        return text.strip()
    except Exception as e:
        logging.error(f"Failed to extract text: {e}")
        return ""

# --- Process Answer ---
def process_answer(question, full_text):
    # Save the full text to a temporary file (UTF-8 so non-ASCII PDF text doesn't break the write)
    with open("temp_text.txt", "w", encoding="utf-8") as f:
        f.write(full_text)

    loader = TextLoader("temp_text.txt", encoding="utf-8")
    docs = loader.load()

    # Chunk the documents with increased size and overlap
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
    splits = text_splitter.split_documents(docs)

    # Load embeddings
    embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")

    # Create Chroma in-memory vector store
    db = Chroma.from_documents(splits, embedding=embeddings)
    retriever = db.as_retriever()

    # Set up the model
    llm = load_model()

    # Create a custom prompt
    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template="""
You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.

Context:
{context}

Question: {question}

Important Instructions:
- If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
- Do NOT summarize or paraphrase.
- If the information is not in the context, say "Not found in the document."

Answer:
""",
    )

    # Retrieval QA with custom prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt_template},
    )

    # Return the answer using the retrieval QA chain
    return qa_chain.run(question)
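# A minimal sketch for exercising process_answer outside the Streamlit UI (editor's example,
# kept commented out; "sample.txt" and the question are placeholders, not part of the app):
#
#   with open("sample.txt", encoding="utf-8") as f:
#       print(process_answer("What is the key idea?", f.read()))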
# --- UI Layout ---
with st.sidebar:
    st.header("📄 Upload PDF")
    uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])

# --- Main Interface ---
if uploaded_file:
    st.success(f"You uploaded: {uploaded_file.name}")
    full_text = read_pdf(uploaded_file)

    if full_text:
        st.subheader("📁 PDF Preview")
        with st.expander("View Extracted Text"):
            st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

        st.subheader("💬 Ask a Question")
        user_question = st.text_input("Type your question about the PDF content")

        if user_question:
            with st.spinner("Thinking..."):
                answer = process_answer(user_question, full_text)
                st.markdown("### 🤖 Answer")
                st.write(answer)

        with st.sidebar:
            st.markdown("---")
            st.markdown("**💡 Suggestions:**")
            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
            with st.expander("💡 Suggestions", expanded=True):
                st.markdown("""
                - "Summarize this document"
                - "Give a quick summary"
                - "What are the main points?"
                - "Explain this document in short"
                """)
    else:
        st.error("⚠️ No text could be extracted from the PDF. Try another file.")
else:
    st.info("Upload a PDF to begin.")

# --- Alternative implementation (FAISS + HuggingFaceHub), kept commented out for reference ---
# import os
# import streamlit as st
# from langchain_community.document_loaders import PyPDFLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain_community.vectorstores import FAISS
# from langchain.chains import RetrievalQA
# from langchain.prompts import PromptTemplate
# from langchain.llms import HuggingFaceHub
#
# # Set your Hugging Face API token here
# os.environ["HUGGINGFACEHUB_API_TOKEN"] = "your_hf_token_here"
#
# # Load and split PDF
# def load_and_split_pdf(uploaded_file):
#     with open("temp.pdf", "wb") as f:
#         f.write(uploaded_file.read())
#     loader = PyPDFLoader("temp.pdf")
#     documents = loader.load()
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
#     chunks = text_splitter.split_documents(documents)
#     return chunks
#
# # Build vectorstore
# def build_vectorstore(chunks):
#     embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#     vectorstore = FAISS.from_documents(chunks, embedding=embedding_model)
#     return vectorstore
#
# # Load Lamini or other HF model
# def get_llm():
#     return HuggingFaceHub(
#         repo_id="lamini/lamini-13b-chat",
#         model_kwargs={"temperature": 0.2, "max_new_tokens": 512}
#     )
#
# # Create prompt template (optional for better accuracy)
# custom_prompt = PromptTemplate(
#     input_variables=["context", "question"],
#     template="""
# You are a helpful assistant. Use the following context to answer the question as accurately as possible.
# If the answer is not in the context, respond with "Not found in the document."
#
# Context:
# {context}
#
# Question: {question}
#
# Answer:"""
# )
#
# # Build QA chain
# def build_qa_chain(vectorstore):
#     llm = get_llm()
#     qa_chain = RetrievalQA.from_chain_type(
#         llm=llm,
#         retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
#         chain_type_kwargs={"prompt": custom_prompt}
#     )
#     return qa_chain
#
# # Streamlit UI
# def main():
#     st.set_page_config(page_title="PDF Chatbot", layout="wide")
#     st.title("Chat with your PDF")
#
#     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
#
#     if uploaded_file:
#         st.success("PDF uploaded successfully!")
#         with st.spinner("Processing PDF..."):
#             chunks = load_and_split_pdf(uploaded_file)
#             vectorstore = build_vectorstore(chunks)
#             qa_chain = build_qa_chain(vectorstore)
#         st.success("Ready to chat!")
#
#         user_question = st.text_input("Ask a question based on the PDF:")
#         if user_question:
#             with st.spinner("Generating answer..."):
#                 result = qa_chain.run(user_question)
#                 st.markdown("**Answer:**")
#                 st.write(result)
#
# if __name__ == "__main__":
#     main()
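# --- Usage note for the active app above (editor's sketch; exact package set and versions
# may vary with your environment, and "app.py" is a placeholder for this file's name) ---
# The imports suggest dependencies along these lines:
#
#   pip install streamlit pymupdf transformers torch sentencepiece \
#       langchain langchain-community chromadb sentence-transformers
#   streamlit run app.py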