import streamlit as st
import os
from huggingface_hub import InferenceApi
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Load the Hugging Face token from environment variables (secrets)
token = os.environ.get("KEY2")  # Replace "KEY2" with your secret key name

# Initialize the Hugging Face Inference API.
# NOTE: InferenceApi is deprecated in recent huggingface_hub releases in favor
# of InferenceClient; a sketch of the replacement follows this function.
def load_llm():
    model_name = "HuggingFaceH4/zephyr-7b-alpha"  # Replace with your preferred model
    api = InferenceApi(repo_id=model_name, token=token)
    return api
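
# A minimal alternative sketch using InferenceClient, the documented replacement
# for the deprecated InferenceApi. It is not wired into the app below; swap it
# in for load_llm() if your huggingface_hub version no longer ships InferenceApi.
# Usage: load_llm_client().text_generation(prompt, max_new_tokens=200, temperature=0.7, top_k=50)
def load_llm_client():
    from huggingface_hub import InferenceClient  # local import; only needed on this path
    return InferenceClient(model="HuggingFaceH4/zephyr-7b-alpha", token=token)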

# Extract text from PDF
def extract_text_from_pdf(file):
    reader = PdfReader(file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages without a text layer
        text += page.extract_text() or ""
    return text

# Split text into chunks
def split_text(text, chunk_size=1000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_text(text)
    return chunks

# Create embeddings and vector store
def create_vector_store(chunks, indexing_method="multi-representation"):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    if indexing_method == "raptors":
        # TODO: implement RAPTOR logic here (e.g., hierarchical chunking/summarization)
        pass
    elif indexing_method == "colbert":
        # TODO: implement ColBERT logic here (e.g., late-interaction embeddings)
        pass
    # Every method currently falls back to a flat FAISS index, which also keeps
    # vector_store from being unbound for unrecognized method names.
    vector_store = FAISS.from_texts(chunks, embeddings)
    return vector_store
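
# A hedged sketch of one way the "multi-representation" branch could work:
# embed a compact proxy per chunk (here, naively the first 300 characters as a
# stand-in for an LLM-written summary) while keeping the full chunk in metadata,
# so retrieval matches on the short representation but answers can use the
# complete text. The proxy length and helper name are illustrative assumptions.
def create_multi_representation_store(chunks):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    proxies = [chunk[:300] for chunk in chunks]  # stand-in summaries (assumption)
    metadatas = [{"full_text": chunk} for chunk in chunks]
    # FAISS.from_texts attaches one metadata dict to each indexed text
    return FAISS.from_texts(proxies, embeddings, metadatas=metadatas)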

# Query the PDF using the Hugging Face API
def query_pdf(vector_store, query, api, query_method="multi-query", max_new_tokens=200, temperature=0.7, top_k=50):
    # NOTE: query_method is surfaced in the UI but not implemented yet; the
    # multi-query sketch below shows one way to use it.
    # Retrieve relevant chunks from the vector store
    docs = vector_store.similarity_search(query)
    context = " ".join([doc.page_content for doc in docs])

    # Create a prompt for the LLM
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"

    # Query the Hugging Face API (InferenceApi.__call__ takes `params`, not `parameters`)
    response = api(
        inputs=prompt,
        params={
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
            "return_full_text": False,  # return only the answer, not the echoed prompt
        },
    )
    return response[0]["generated_text"], docs
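
# A hedged sketch of the "multi-query" translation option the UI offers: ask the
# LLM for paraphrases of the question, retrieve for each variant, and keep the
# de-duplicated union. The prompt wording, variant count, and helper name are
# illustrative assumptions, not part of the original app.
def multi_query_retrieve(vector_store, query, api, n_variants=3, k=4):
    gen_prompt = f"Rewrite the following question in {n_variants} different ways, one per line:\n{query}"
    response = api(inputs=gen_prompt, params={"max_new_tokens": 100, "return_full_text": False})
    variants = [line.strip() for line in response[0]["generated_text"].splitlines() if line.strip()]
    seen, docs = set(), []
    for q in [query] + variants[:n_variants]:
        for doc in vector_store.similarity_search(q, k=k):
            if doc.page_content not in seen:
                seen.add(doc.page_content)
                docs.append(doc)
    return docs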

# Detect language of the text (defined for future use; not called by the app yet)
def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "en"  # Default to English if detection fails

# Streamlit App
def main():
    st.title("Chat with PDF")
    st.write("Upload a PDF and ask questions about it!")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is None:
        st.info("Using default PDF.")
        uploaded_file = "default.pdf"  # Add a default PDF

    # Step 1: Extract text and split into chunks
    if "text" not in st.session_state:
        st.session_state.text = None
    if "chunks" not in st.session_state:
        st.session_state.chunks = None

    if st.button("Extract Text and Split into Chunks"):
        st.session_state.text = extract_text_from_pdf(uploaded_file)
        st.session_state.chunks = split_text(st.session_state.text)
        st.success("Text extracted and split into chunks!")

    # Step 2: Create vector store
    if "vector_store" not in st.session_state:
        st.session_state.vector_store = None

    if st.session_state.chunks:
        st.subheader("Indexing Options")
        indexing_method = st.selectbox(
            "Indexing Method",
            ["multi-representation", "raptors", "colbert"],
            help="Choose how to index the PDF text."
        )
        if st.button("Create Vector Store"):
            st.session_state.vector_store = create_vector_store(st.session_state.chunks, indexing_method=indexing_method)
            st.success("Vector store created!")

    # Step 3: Load LLM (Hugging Face API)
    if "api" not in st.session_state:
        st.session_state.api = None

    if st.session_state.vector_store:
        st.subheader("LLM Parameters")
        temperature = st.slider("Temperature", 0.1, 1.0, 0.7, help="Controls randomness in the output.")
        top_k = st.slider("Top-k", 1, 100, 50, help="Limits sampling to the top-k tokens.")
        max_new_tokens = st.slider("Max New Tokens", 50, 500, 200, help="Maximum number of tokens to generate.")
        if st.button("Load LLM"):
            api = load_llm()
            st.session_state.api = api
            st.success("LLM loaded!")

    # Step 4: Query the PDF
    if st.session_state.api:
        st.subheader("Query Translation Options")
        query_method = st.selectbox(
            "Query Translation Method",
            ["multi-query", "rag-fusion", "decomposition", "step-back", "hyde"],
            help="Choose a method to improve query retrieval."
        )
        query = st.text_input("Ask a question about the PDF:")
        if query:
            answer, source_docs = query_pdf(
                st.session_state.vector_store,
                query,
                st.session_state.api,
                query_method=query_method,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_k=top_k,
            )
            st.write("**Answer:**", answer)
            st.write("**Source Text:**")
            for doc in source_docs:
                st.write(doc.page_content)

if __name__ == "__main__":
    main()