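"""Streamlit app for chatting with a PDF.

Pipeline: extract text with PyPDF2, split it into overlapping chunks, embed the
chunks into a FAISS vector store, then answer questions with a hosted Hugging
Face model using the retrieved chunks as context.
"""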
import streamlit as st
import os
from huggingface_hub import InferenceApi
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langdetect import detect
# Load the Hugging Face token from environment variables (secrets)
token = os.environ.get("KEY2")  # Replace "KEY2" with the name of your secret
# Initialize the Hugging Face Inference API
def load_llm():
    model_name = "HuggingFaceH4/zephyr-7b-alpha"  # Replace with your preferred model
    # Note: InferenceApi is deprecated in recent huggingface_hub releases;
    # InferenceClient is its successor if you upgrade.
    api = InferenceApi(repo_id=model_name, token=token)
    return api

# Extract text from PDF
def extract_text_from_pdf(file):
    reader = PdfReader(file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages without extractable text
        text += page.extract_text() or ""
    return text

# Split text into chunks
def split_text(text, chunk_size=1000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_text(text)
    return chunks

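# With the defaults above, consecutive chunks share roughly 200 characters, so
# context that straddles a chunk boundary still appears intact in at least one chunk.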
# Create embeddings and vector store
def create_vector_store(chunks, indexing_method="multi-representation", **kwargs):
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    if indexing_method == "multi-representation":
        vector_store = FAISS.from_texts(chunks, embeddings)
    elif indexing_method == "raptors":
        # Placeholder: RAPTOR logic (e.g., hierarchical chunking and summarization)
        # is not implemented yet, so this falls back to a plain FAISS index.
        vector_store = FAISS.from_texts(chunks, embeddings)
    elif indexing_method == "colbert":
        # Placeholder: ColBERT logic (e.g., contextualized token-level embeddings)
        # is not implemented yet, so this also falls back to a plain FAISS index.
        vector_store = FAISS.from_texts(chunks, embeddings)
    return vector_store

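# Background notes on the indexing options above (none are wired up yet):
# - Multi-representation indexing stores compact summaries for retrieval while
#   returning the full chunks; here it is approximated by a flat FAISS index.
# - RAPTOR recursively clusters and summarizes chunks into a tree of abstractions.
# - ColBERT scores queries against token-level embeddings (late interaction).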
# Query the PDF using the Hugging Face API
def query_pdf(vector_store, query, api, query_method="multi-query", max_new_tokens=200, temperature=0.7, top_k=50):
    # Note: query_method is accepted but not applied yet; every query currently
    # uses plain similarity search regardless of the selected translation method.
    # Retrieve relevant chunks from the vector store
    docs = vector_store.similarity_search(query)
    context = " ".join([doc.page_content for doc in docs])
    # Create a prompt for the LLM
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    # Query the Hugging Face API (InferenceApi takes `params`, not `parameters`)
    response = api(
        inputs=prompt,
        params={
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
            "return_full_text": False,  # return only the completion, not the prompt
        },
    )
    return response[0]["generated_text"], docs

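# End-to-end usage outside Streamlit (a minimal sketch, assuming a local file
# "sample.pdf" exists and the KEY2 token is set in the environment):
#   api = load_llm()
#   chunks = split_text(extract_text_from_pdf("sample.pdf"))
#   store = create_vector_store(chunks)
#   answer, docs = query_pdf(store, "What is this document about?", api)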
# Detect language of the text
def detect_language(text):
    try:
        return detect(text)
    except Exception:
        return "en"  # Default to English if detection fails

# Streamlit App
def main():
    st.title("Chat with PDF")
    st.write("Upload a PDF and ask questions about it!")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is None:
        st.info("Using default PDF.")
        uploaded_file = "default.pdf"  # Fallback; requires a default.pdf in the app directory

    # Step 1: Extract text and split into chunks
    if "text" not in st.session_state:
        st.session_state.text = None
    if "chunks" not in st.session_state:
        st.session_state.chunks = None
    if st.button("Extract Text and Split into Chunks"):
        st.session_state.text = extract_text_from_pdf(uploaded_file)
        st.session_state.chunks = split_text(st.session_state.text)
        st.success("Text extracted and split into chunks!")

    # Step 2: Create vector store
    if "vector_store" not in st.session_state:
        st.session_state.vector_store = None
    if st.session_state.chunks:
        st.subheader("Indexing Options")
        indexing_method = st.selectbox(
            "Indexing Method",
            ["multi-representation", "raptors", "colbert"],
            help="Choose how to index the PDF text.",
        )
        if st.button("Create Vector Store"):
            st.session_state.vector_store = create_vector_store(
                st.session_state.chunks, indexing_method=indexing_method
            )
            st.success("Vector store created!")

    # Step 3: Load LLM (Hugging Face API)
    if "api" not in st.session_state:
        st.session_state.api = None
    if st.session_state.vector_store:
        st.subheader("LLM Parameters")
        temperature = st.slider("Temperature", 0.1, 1.0, 0.7, help="Controls randomness in the output.")
        top_k = st.slider("Top-k", 1, 100, 50, help="Limits sampling to the top-k tokens.")
        max_new_tokens = st.slider("Max New Tokens", 50, 500, 200, help="Maximum number of tokens to generate.")
        if st.button("Load LLM"):
            st.session_state.api = load_llm()
            st.success("LLM loaded!")

    # Step 4: Query the PDF
    if st.session_state.api:
        st.subheader("Query Translation Options")
        query_method = st.selectbox(
            "Query Translation Method",
            ["multi-query", "rag-fusion", "decomposition", "step-back", "hyde"],
            help="Choose a method to improve query retrieval.",
        )
        query = st.text_input("Ask a question about the PDF:")
        if query:
            answer, source_docs = query_pdf(
                st.session_state.vector_store,
                query,
                st.session_state.api,
                query_method=query_method,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_k=top_k,
            )
            st.write("**Answer:**", answer)
            st.write("**Source Text:**")
            for doc in source_docs:
                st.write(doc.page_content)


if __name__ == "__main__":
    main()
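# Launch locally with: streamlit run <path-to-this-file>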