import os

import streamlit as st
from huggingface_hub import InferenceApi  # legacy client; newer code uses InferenceClient
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langdetect import detect

# Load the Hugging Face token from environment variables (secrets).
token = os.environ.get("KEY2")  # Replace "KEY2" with your secret key name


# Initialize the Hugging Face Inference API client.
def load_llm():
    model_name = "HuggingFaceH4/zephyr-7b-alpha"  # Replace with your preferred model
    api = InferenceApi(repo_id=model_name, token=token)
    return api


# Extract text from a PDF (accepts a file-like object or a path).
def extract_text_from_pdf(file):
    reader = PdfReader(file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages.
        text += page.extract_text() or ""
    return text


# Split text into overlapping chunks.
def split_text(text, chunk_size=1000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_text(text)


# Create embeddings and a vector store.
def create_vector_store(chunks, indexing_method="multi-representation", **kwargs):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    if indexing_method == "multi-representation":
        vector_store = FAISS.from_texts(chunks, embeddings)
    elif indexing_method == "raptors":
        # TODO: implement RAPTOR logic here (e.g., hierarchical chunking);
        # falls back to plain FAISS indexing for now.
        vector_store = FAISS.from_texts(chunks, embeddings)
    elif indexing_method == "colbert":
        # TODO: implement ColBERT logic here (e.g., contextualized embeddings);
        # falls back to plain FAISS indexing for now.
        vector_store = FAISS.from_texts(chunks, embeddings)
    else:
        raise ValueError(f"Unknown indexing method: {indexing_method}")
    return vector_store


# Query the PDF using the Hugging Face API.
def query_pdf(vector_store, query, api, query_method="multi-query", max_new_tokens=200, temperature=0.7, top_k=50):
    # NOTE: query_method is accepted but not yet acted on; every method
    # currently runs a plain similarity search (see the multi-query sketch below).
    docs = vector_store.similarity_search(query)
    context = " ".join(doc.page_content for doc in docs)

    # Build a prompt for the LLM.
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"

    # Query the Hugging Face API. InferenceApi takes generation
    # parameters via the `params` keyword, not `parameters`.
    response = api(
        inputs=prompt,
        params={
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    )
    answer = response[0]["generated_text"]
    # Some text-generation endpoints echo the prompt; strip it if present.
    if answer.startswith(prompt):
        answer = answer[len(prompt):]
    return answer.strip(), docs


# Detect the language of the text (currently unused; kept for future
# language-aware prompting).
def detect_language(text):
    try:
        return detect(text)
    except Exception:
        return "en"  # Default to English if detection fails
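
# --- Hedged sketch: a fuller "multi-representation" index. ---
# create_vector_store() above treats all three indexing methods identically.
# One common reading of multi-representation indexing is to embed a compact
# summary of each chunk while keeping the full chunk as metadata, so retrieval
# matches on summaries but the answer prompt sees the full text. This helper
# is illustrative only: build_multi_representation_store() is a hypothetical
# name, and the summarization prompt is an assumption about the endpoint.
def build_multi_representation_store(chunks, api, embeddings):
    summaries, metadatas = [], []
    for chunk in chunks:
        prompt = f"Summarize in one sentence:\n\n{chunk}\n\nSummary:"
        response = api(inputs=prompt, params={"max_new_tokens": 60})
        summary = response[0]["generated_text"]
        if summary.startswith(prompt):
            summary = summary[len(prompt):]
        # Fall back to the chunk head if the model returns nothing usable.
        summaries.append(summary.strip() or chunk[:200])
        metadatas.append({"full_text": chunk})
    # Embed the summaries; the original chunk stays retrievable via metadata.
    return FAISS.from_texts(summaries, embeddings, metadatas=metadatas)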
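
# --- Hedged sketch: one way the "multi-query" option could be wired in. ---
# query_pdf() above accepts query_method but does not yet use it. The helpers
# below sketch the multi-query idea (paraphrase the question, retrieve with
# each variant, merge and deduplicate). They are not part of the original app:
# generate_query_variants() and multi_query_search() are hypothetical names,
# and the paraphrasing prompt is an assumption about how zephyr responds.
def generate_query_variants(api, query, n=3, max_new_tokens=100):
    """Ask the LLM for n paraphrases of the user's question."""
    prompt = (
        f"Rewrite the following question in {n} different ways, "
        f"one per line:\n\n{query}\n"
    )
    response = api(inputs=prompt, params={"max_new_tokens": max_new_tokens})
    text = response[0]["generated_text"]
    if text.startswith(prompt):
        text = text[len(prompt):]
    variants = [line.strip() for line in text.splitlines() if line.strip()]
    return [query] + variants[:n]


def multi_query_search(vector_store, api, query, k=4):
    """Retrieve with every query variant and deduplicate the merged results."""
    seen, merged = set(), []
    for variant in generate_query_variants(api, query):
        for doc in vector_store.similarity_search(variant, k=k):
            if doc.page_content not in seen:
                seen.add(doc.page_content)
                merged.append(doc)
    return merged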
# Streamlit app
def main():
    st.title("Chat with PDF")
    st.write("Upload a PDF and ask questions about it!")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is None:
        st.info("Using default PDF.")
        uploaded_file = "default.pdf"  # Path to a bundled default PDF

    # Step 1: Extract text and split it into chunks.
    if "text" not in st.session_state:
        st.session_state.text = None
    if "chunks" not in st.session_state:
        st.session_state.chunks = None
    if st.button("Extract Text and Split into Chunks"):
        st.session_state.text = extract_text_from_pdf(uploaded_file)
        st.session_state.chunks = split_text(st.session_state.text)
        st.success("Text extracted and split into chunks!")

    # Step 2: Create the vector store.
    if "vector_store" not in st.session_state:
        st.session_state.vector_store = None
    if st.session_state.chunks:
        st.subheader("Indexing Options")
        indexing_method = st.selectbox(
            "Indexing Method",
            ["multi-representation", "raptors", "colbert"],
            help="Choose how to index the PDF text.",
        )
        if st.button("Create Vector Store"):
            st.session_state.vector_store = create_vector_store(
                st.session_state.chunks, indexing_method=indexing_method
            )
            st.success("Vector store created!")

    # Step 3: Load the LLM (Hugging Face API).
    if "api" not in st.session_state:
        st.session_state.api = None
    if st.session_state.vector_store:
        st.subheader("LLM Parameters")
        temperature = st.slider("Temperature", 0.1, 1.0, 0.7, help="Controls randomness in the output.")
        top_k = st.slider("Top-k", 1, 100, 50, help="Limits sampling to the top-k tokens.")
        max_new_tokens = st.slider("Max New Tokens", 50, 500, 200, help="Maximum number of tokens to generate.")
        if st.button("Load LLM"):
            st.session_state.api = load_llm()
            st.success("LLM loaded!")

        # Step 4: Query the PDF. Kept inside the vector-store branch so the
        # slider values above are always defined when query_pdf() runs.
        if st.session_state.api:
            st.subheader("Query Translation Options")
            query_method = st.selectbox(
                "Query Translation Method",
                ["multi-query", "rag-fusion", "decomposition", "step-back", "hyde"],
                help="Choose a method to improve query retrieval.",
            )
            query = st.text_input("Ask a question about the PDF:")
            if query:
                answer, source_docs = query_pdf(
                    st.session_state.vector_store,
                    query,
                    st.session_state.api,
                    query_method=query_method,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    top_k=top_k,
                )
                st.write("**Answer:**", answer)
                st.write("**Source Text:**")
                for doc in source_docs:
                    st.write(doc.page_content)


if __name__ == "__main__":
    main()
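
# To try the app locally (assuming this file is saved as app.py and the
# KEY2 environment variable holds a valid Hugging Face token):
#
#   streamlit run app.py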