import streamlit as st
import os
from groq import Groq
import fitz  # PyMuPDF for PDF parsing
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer  # Hugging Face transformer
from io import BytesIO  # To handle file upload correctly

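# Typical setup (assuming this file is saved as app.py):
#   pip install streamlit groq pymupdf numpy faiss-cpu sentence-transformers
#   streamlit run app.py
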
# Initialize the Hugging Face model and Groq API client
model = SentenceTransformer('all-MiniLM-L6-v2')  # Model for generating embeddings
GROQ_API_KEY = "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"
client = Groq(api_key=GROQ_API_KEY)

# Function to extract text from a PDF
def extract_text_from_pdf(file):
    doc = fitz.open(stream=file.read(), filetype="pdf")  # Use the stream and specify file type
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()  # Release the PyMuPDF document once the text has been extracted
    return text

# Function to generate embeddings using Hugging Face model (for text retrieval)
def generate_huggingface_embeddings(text):
    embeddings = model.encode(text)  # Using the SentenceTransformer model
    return embeddings

# Function to get relevant chunks from the document using FAISS similarity search
# (uses the module-level `index` and `document_chunks` built after a PDF is uploaded below)
def get_relevant_chunks(query, top_k=5):
    query_embedding = generate_huggingface_embeddings(query)  # Get query embedding
    query_embedding = np.array(query_embedding, dtype="float32").reshape(1, -1)  # Reshape (and cast) for FAISS
    
    # Perform similarity search in FAISS
    distances, indices = index.search(query_embedding, top_k)
    relevant_chunks = [document_chunks[i] for i in indices[0]]
    return relevant_chunks

# Function to generate an answer based on retrieved context and Groq's model
def generate_answer(query):
    relevant_chunks = get_relevant_chunks(query)
    context = " ".join(relevant_chunks)  # Combine the most relevant chunks

    # Generate the response with Groq's chat model
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": f"Answer based on this: {context}"}],
        model="llama3-8b-8192",  # Adjust with the appropriate Groq model
        stream=False
    )
    return chat_completion.choices[0].message.content

# Streamlit app interface
st.title("Knowledge-Based Assistant")
st.write("Upload a PDF to generate answers based on its content.")

# Upload PDF file
pdf_file = st.file_uploader("Choose a PDF file", type="pdf")

if pdf_file is not None:
    # Extract the text content from the uploaded PDF
    document_text = extract_text_from_pdf(pdf_file)

    # Split the document into chunks (adjust chunk size as needed)
    chunk_size = 1000  # Size of each chunk of text for embedding
    document_chunks = [document_text[i:i+chunk_size] for i in range(0, len(document_text), chunk_size)]
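    # Note: fixed-size character chunks can cut sentences mid-word; an overlap of
    # 100-200 characters between consecutive chunks is a common refinement here.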

    # Generate embeddings for each chunk and store them
    embeddings = [generate_huggingface_embeddings(chunk) for chunk in document_chunks]

    # Convert embeddings to numpy arrays for FAISS
    embeddings_array = np.array(embeddings, dtype="float32")  # FAISS expects float32 vectors

    # Initialize FAISS index
    index = faiss.IndexFlatL2(embeddings_array.shape[1])  # L2 distance metric

    # Add embeddings to the FAISS index
    index.add(embeddings_array)
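    # IndexFlatL2 performs exact (brute-force) nearest-neighbour search, which is
    # reasonable for a single PDF; larger corpora would usually switch to an
    # approximate index (e.g. faiss.IndexIVFFlat).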

    # Query input from user
    query = st.text_input("Ask a question about the document:")

    if query:
        # Generate the answer based on the query
        answer = generate_answer(query)
        st.write("Answer: ", answer)